Skip to content

Commit 93f995b

Browse files
committed
Process Native CPU only properties
1 parent 9937d02 commit 93f995b

File tree

5 files changed

+61
-20
lines changed

5 files changed

+61
-20
lines changed

source/adapters/native_cpu/enqueue.cpp

+6-9
Original file line numberDiff line numberDiff line change
@@ -138,13 +138,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
138138
#else
139139
bool isLocalSizeOne =
140140
ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
141-
if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads) {
142-
// If the local size is one, we make the assumption that we are running a
143-
// parallel_for over a sycl::range.
144-
// Todo: we could add compiler checks and
145-
// kernel properties for this (e.g. check that no barriers are called, no
146-
// local memory args).
147-
141+
if (isLocalSizeOne && !hKernel->isNDRangeKernel()) {
148142
// Todo: this assumes that dim 0 is the best dimension over which we want to
149143
// parallelize
150144

@@ -153,8 +147,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
153147
// divide the global range by the number of threads, set that as the local
154148
// size and peel everything else.
155149

156-
size_t new_num_work_groups_0 = numParallelThreads;
157-
size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;
150+
size_t new_num_work_groups_0 =
151+
ndr.GlobalSize[0] > numParallelThreads ? numParallelThreads : 1;
152+
size_t itemsPerThread = ndr.GlobalSize[0] > numParallelThreads
153+
? ndr.GlobalSize[0] / numParallelThreads
154+
: ndr.GlobalSize[0];
158155

159156
for (unsigned g2 = 0; g2 < numWG2; g2++) {
160157
for (unsigned g1 = 0; g1 < numWG1; g1++) {

source/adapters/native_cpu/kernel.cpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,13 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
4747
if (auto MaxLIt = MaxLinMap.find(pKernelName); MaxLIt != MaxLinMap.end()) {
4848
MaxLinearWG = MaxLIt->second;
4949
}
50+
bool IsNDRangeKernel = false;
51+
if (auto isNDIt = hProgram->KernelIsNDRangeMD.find(pKernelName);
52+
isNDIt != hProgram->KernelIsNDRangeMD.end()) {
53+
IsNDRangeKernel = isNDIt->second;
54+
}
5055
kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdWG, MaxWG,
51-
MaxLinearWG);
56+
MaxLinearWG, IsNDRangeKernel);
5257

5358
*phKernel = kernel;
5459

source/adapters/native_cpu/kernel.hpp

+7-7
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,12 @@ struct local_arg_info_t {
2828

2929
struct ur_kernel_handle_t_ : RefCounted {
3030

31-
ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
32-
nativecpu_task_t subhandler)
33-
: hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {}
34-
3531
ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
3632
: Args(other.Args), hProgram(other.hProgram), _name(other._name),
3733
_subhandler(other._subhandler), _localArgInfo(other._localArgInfo),
3834
_localMemPool(other._localMemPool),
3935
_localMemPoolSize(other._localMemPoolSize),
40-
ReqdWGSize(other.ReqdWGSize) {
36+
ReqdWGSize(other.ReqdWGSize), NDRangeKernel(other.NDRangeKernel) {
4137
incrementReferenceCount();
4238
}
4339

@@ -52,10 +48,11 @@ struct ur_kernel_handle_t_ : RefCounted {
5248
nativecpu_task_t subhandler,
5349
std::optional<native_cpu::WGSize_t> ReqdWGSize,
5450
std::optional<native_cpu::WGSize_t> MaxWGSize,
55-
std::optional<uint64_t> MaxLinearWGSize)
51+
std::optional<uint64_t> MaxLinearWGSize,
52+
bool isNDRangeKernel)
5653
: hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
5754
ReqdWGSize(ReqdWGSize), MaxWGSize(MaxWGSize),
58-
MaxLinearWGSize(MaxLinearWGSize) {}
55+
MaxLinearWGSize(MaxLinearWGSize), NDRangeKernel(isNDRangeKernel) {}
5956

6057
struct arguments {
6158
using args_index_t = std::vector<void *>;
@@ -162,10 +159,13 @@ struct ur_kernel_handle_t_ : RefCounted {
162159

163160
void addPtrArg(void *Ptr, size_t Index) { Args.addPtrArg(Index, Ptr); }
164161

162+
bool isNDRangeKernel() const { return NDRangeKernel; }
163+
165164
private:
166165
char *_localMemPool = nullptr;
167166
size_t _localMemPoolSize = 0;
168167
std::optional<native_cpu::WGSize_t> ReqdWGSize = std::nullopt;
169168
std::optional<native_cpu::WGSize_t> MaxWGSize = std::nullopt;
170169
std::optional<uint64_t> MaxLinearWGSize = std::nullopt;
170+
const bool NDRangeKernel = false;
171171
};

source/adapters/native_cpu/program.cpp

+14-2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
//
99
//===----------------------------------------------------------------------===//
1010

11+
#include "ur/ur.hpp"
1112
#include "ur_api.h"
1213

1314
#include "common.hpp"
@@ -99,14 +100,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
99100
}
100101
}
101102

102-
const nativecpu_entry *nativecpu_it =
103-
reinterpret_cast<const nativecpu_entry *>(pBinary);
103+
const nativecpu_program *program =
104+
reinterpret_cast<const nativecpu_program *>(pBinary);
105+
const nativecpu_entry *nativecpu_it = program->entries;
104106
while (nativecpu_it->kernel_ptr != nullptr) {
105107
hProgram->_kernels.insert(
106108
std::make_pair(nativecpu_it->kernelname, nativecpu_it->kernel_ptr));
107109
nativecpu_it++;
108110
}
109111

112+
// Process Native CPU specific properties
113+
const _pi_device_binary_property_set_struct *props = program->properties;
114+
for (auto prop = props->PropertiesBegin; prop != props->PropertiesEnd;
115+
prop++) {
116+
auto [Prefix, Tag] = splitMetadataName(prop->Name);
117+
if (Tag == "@is_nd_range") {
118+
hProgram->KernelIsNDRangeMD[Prefix] = prop->ValSize;
119+
}
120+
}
121+
110122
*phProgram = hProgram.release();
111123

112124
return UR_RESULT_SUCCESS;

source/adapters/native_cpu/program.hpp

+28-1
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,40 @@ struct ur_program_handle_t_ : RefCounted {
4141
std::unordered_map<std::string, native_cpu::WGSize_t>
4242
KernelMaxWorkGroupSizeMD;
4343
std::unordered_map<std::string, uint64_t> KernelMaxLinearWorkGroupSizeMD;
44+
std::unordered_map<std::string, bool> KernelIsNDRangeMD;
4445
};
4546

46-
// The nativecpu_entry struct is also defined as LLVM-IR in the
47+
// These structs are also defined as LLVM-IR in the
4748
// clang-offload-wrapper tool. The two definitions need to match,
4849
// therefore any change to this struct needs to be reflected in the
4950
// offload-wrapper.
51+
5052
struct nativecpu_entry {
5153
const char *kernelname;
5254
const unsigned char *kernel_ptr;
5355
};
56+
57+
typedef enum {
58+
PI_PROPERTY_TYPE_INT32,
59+
PI_PROPERTY_TYPE_STRING
60+
} pi_property_type;
61+
62+
struct _pi_device_binary_property_struct {
63+
char *Name;
64+
void *ValAddr;
65+
pi_property_type Type;
66+
uint64_t ValSize;
67+
};
68+
69+
// TODO These property structs are taken from clang-offload-wrapper,
70+
// perhaps we could define something that fits better our purposes?
71+
struct _pi_device_binary_property_set_struct {
72+
char *Name;
73+
_pi_device_binary_property_struct *PropertiesBegin;
74+
_pi_device_binary_property_struct *PropertiesEnd;
75+
};
76+
77+
struct nativecpu_program {
78+
nativecpu_entry *entries;
79+
_pi_device_binary_property_set_struct *properties;
80+
};

0 commit comments

Comments
 (0)