Skip to content

Commit

Permalink
Merge pull request #87 from Ralender/FixEdgeDetection
Browse files Browse the repository at this point in the history
[SYCL] Fix edge_detection and add it to the lit test-suite
  • Loading branch information
keryell authored Oct 1, 2020
2 parents 9a3f148 + f92a2bd commit 22603c6
Show file tree
Hide file tree
Showing 14 changed files with 74 additions and 105 deletions.
73 changes: 0 additions & 73 deletions llvm/lib/SYCL/InSPIRation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -348,78 +348,6 @@ struct InSPIRation : public ModulePass {
}
}

/// This currently looks through the arguments passed to the SSDM intrinsic
/// call and checks the instruction to see if it is an address space cast to
/// a generic, if it is, it will take the concrete segment of the cast and
/// replace the operand with it. It looks like the below example:
///
/// Before:
/// %10 = getelementptr inbounds %"struct._ZTSN2cl4sycl6xilinx15partition_
/// arrayIcLm9ENS1_9partition8completeILm0EEEEE.cl::sycl::xilinx::partition_
/// array", %"struct._ZTSN2cl4sycl6xilinx15partition_arrayIcLm9ENS1_9
/// partition8completeILm0EEEEE.cl::sycl::xilinx::partition_array"* %1,
/// i64 0, i32 0, i64 0
/// %11 = addrspacecast i8* %10 to i8 addrspace(4)*
/// call spir_func void (...) @_ssdm_SpecArrayPartition(i8 addrspace(4)* %11,
/// i64 0, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i64 0,
/// i64 0), i32 0, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str.2,
/// i64 0, i64 0)) #4
///
/// After:
/// %10 = getelementptr inbounds %"struct._ZTSN2cl4sycl6xilinx15partition_
/// arrayIcLm9ENS1_9partition8completeILm0EEEEE.cl::sycl::xilinx::partition
/// _array", %"struct._ZTSN2cl4sycl6xilinx15partition_arrayIcLm9ENS1_9
/// partition8completeILm0EEEEE.cl::sycl::xilinx::partition_array"* %1,
/// i64 0, i32 0, i64 0
/// call spir_func void (...) @_ssdm_SpecArrayPartition(i8* %10, i64 0,
/// i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i64 0, i64 0),
/// i32 0, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str.2, i64 0,
/// i64 0)) #4
///
/// It's simply collapsing away the cast right now, this doesn't consider the
/// possibility of interactions with other possible address space casts that
/// depend on it (we simply replace all uses with the non-geneirc variant).
/// For now we hope that these are erased or eliminated by either of the AS
/// Fixer passes or DCE passes.
///
/// Note: Unsure how robust this is with the small sample size we currently
/// have to test against. So it's a WIP. If the situation becomes untenable
/// we can likely revert the accessor class back to it's prior form by
/// reverting this pull: https://github.com/intel/llvm/pull/348 (commit:
/// 609999c4e1aeca05aff010ce5e2eb08dde08fd69). This may cause address space
/// leakage however, but should result in more overall address space
/// consistency/stability due to the addition of more concrete address spaces.
void handleSpecArrayPartition(CallInst *CI) {
for (Use &Op : CI->operands()) {
if (Op->getType()->isPointerTy()) {
if (auto *ASC = dyn_cast<AddrSpaceCastInst>(Op)) {
if (ASC->getDestAddressSpace() == /*Generic AS*/ 4) {
Op.set(nullptr);
Op.set(ASC->getPointerOperand());
}
}
}
}
}

/// SSDM intrinsics are black boxes, the InferAddressSpace pass will not touch
/// them (this is in part due to the fact it doesn't deal with Calls and in
/// part because SSDMs are declared with no implementation and no arguments),
/// this will result in left over generic casts. This function is here
/// to deal with the cases of the left over generics caused by SSDM intrinsics
/// as if they're left in the compilation will fail.
///
/// In the future if we ever define an LLVM target backend similar to AMDGPU
/// and we end up with a lot of these edge cases we could move this to the
/// InferAddressSpaces pass and teach it to deal with these SSDM calls as
/// Intrinsics.
void ssdmAddressSpaceFix(Function &F) {
for (auto &I : instructions(F))
if (auto *Call = dyn_cast<CallInst>(&I))
if (Call->getIntrinsicID() == Intrinsic::sideeffect)
handleSpecArrayPartition(Call);
}

// Hopeful list/probably impractical asks for xocc:
// 1) Make XML generator/reader a little kinder towards arguments with no
// names if possible
Expand Down Expand Up @@ -486,7 +414,6 @@ struct InSPIRation : public ModulePass {
// It doesn't require application to the SPIR intrinsics as we're
// linking against the HLS SPIR library, which is already conformant.
giveNameToArguments(F);
ssdmAddressSpaceFix(F);
} else if (isTransitiveNonIntrinsicFunc(F)
&& F.isDeclaration()) {
// push back intrinsics to make sure we handle naming after changing the
Expand Down
12 changes: 12 additions & 0 deletions sycl/include/CL/sycl/detail/defines.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,15 @@
#else
#define __SYCL_INLINE_CONSTEXPR static constexpr
#endif

#if defined(__SYCL_DEVICE_ONLY__)
#define __SYCL_DEVICE_ANNOTATE(...) __attribute__((annotate(__VA_ARGS__)))
#else
#define __SYCL_DEVICE_ANNOTATE(...)
#endif

#if defined(__SYCL_DEVICE_ONLY__)
#define __SYCL_DEVICE_ADDRSPACE(AS) __attribute__((address_space(AS)))
#else
#define __SYCL_DEVICE_ADDRSPACE(AS)
#endif
8 changes: 1 addition & 7 deletions sycl/include/CL/sycl/xilinx/fpga/opt_decorate_func.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,6 @@
#include "CL/sycl/xilinx/fpga/ssdm_inst.hpp"
#include "CL/sycl/detail/defines.hpp"

#if defined(__SYCL_DEVICE_ONLY__)
#define SYCL_DEVICE_ANNOTATE(...) __attribute__((annotate(__VA_ARGS__)))
#else
#define SYCL_DEVICE_ANNOTATE(...)
#endif

__SYCL_INLINE_NAMESPACE(cl) {

namespace sycl::xilinx {
Expand Down Expand Up @@ -59,7 +53,7 @@ void dataflow(T functor) {
pipeline way.
*/
template <typename T>
SYCL_DEVICE_ANNOTATE("xilinx_pipeline") __attribute__((always_inline))
__SYCL_DEVICE_ANNOTATE("xilinx_pipeline") __attribute__((always_inline))
void pipeline(T functor) {
functor();
}
Expand Down
27 changes: 17 additions & 10 deletions sycl/include/CL/sycl/xilinx/fpga/partition_array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ namespace partition {

/// This fuction is currently empty but the LowerSYCLMetaData Pass will fill
/// it with the required IR.
SYCL_DEVICE_ANNOTATE("xilinx_partition_array") __attribute__((always_inline))
inline void xilinx_partition_array(void*, int, int, int) {}
template<typename Ptr>
__SYCL_DEVICE_ANNOTATE("xilinx_partition_array") __attribute__((always_inline))
inline void xilinx_partition_array(Ptr, int, int, int) {}

/** Represent a cyclic partition.
Expand Down Expand Up @@ -130,6 +131,7 @@ namespace partition {
*/
template <std::size_t PDim = 1>
struct complete {
static_assert(PDim >= 1, "array can't 0 dimension");
static constexpr auto partition_dim = PDim;
static constexpr auto partition_type = type::complete;
};
Expand Down Expand Up @@ -197,16 +199,21 @@ struct partition_array {
// Add the intrinsic according expressing to the target compiler the
// partitioning to use
if constexpr (partition_type == partition::type::cyclic)
partition::xilinx_partition_array(&(*this)[0], partition_type,
PartitionType::physical_mem_num,
PartitionType::partition_dim);
partition::xilinx_partition_array(
(ValueType __SYCL_DEVICE_ADDRSPACE(0 /*stack*/)(*)[Size])(&elems) &
elems,
partition_type, PartitionType::physical_mem_num,
PartitionType::partition_dim);
if constexpr (partition_type == partition::type::block)
partition::xilinx_partition_array(&(*this)[0], partition_type,
PartitionType::ele_in_each_physical_mem,
PartitionType::partition_dim);
partition::xilinx_partition_array(
(ValueType __SYCL_DEVICE_ADDRSPACE(0 /*stack*/)(*)[Size])(&elems) &
elems,
partition_type, PartitionType::ele_in_each_physical_mem,
PartitionType::partition_dim);
if constexpr (partition_type == partition::type::complete)
partition::xilinx_partition_array(&(*this)[0], partition_type, 0,
PartitionType::partition_dim);
partition::xilinx_partition_array(
(ValueType __SYCL_DEVICE_ADDRSPACE(0 /*stack*/)(*)[Size])(&elems),
partition_type, 0, PartitionType::partition_dim);
}

/// A constructor from some container
Expand Down
30 changes: 25 additions & 5 deletions sycl/test/lit.cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
# test_source_root: The root path where tests are located.
config.test_source_root = os.path.dirname(__file__)

timeout=600

# test_exec_root: The root path where tests should be run.
config.test_exec_root = os.path.join(config.sycl_obj_root, 'test')

Expand Down Expand Up @@ -80,7 +82,6 @@
config.substitutions.append( ('%sycl_be', backend) )

xocc=lit_config.params.get('XOCC', "off")
lit_config.note("XOCC={}".format(xocc))

get_device_count_by_type_path = os.path.join(config.llvm_tools_dir, "get_device_count_by_type")

Expand Down Expand Up @@ -248,6 +249,13 @@ def getDeviceCount(device_type):
required_env = ['HOME', 'USER', 'XILINX_XRT', 'XILINX_SDX', 'XILINX_PLATFORM', 'EMCONFIG_PATH', 'LIBRARY_PATH']
has_error=False
config.available_features.add("xocc")
config.available_features.add(xocc_target)
pkg_opencv4 = subprocess.run(["pkg-config", "--libs", "--cflags", "opencv4"], stdout=subprocess.PIPE)
has_opencv4 = not pkg_opencv4.returncode
lit_config.note("has opencv4: {}".format(has_opencv4))
if has_opencv4:
config.available_features.add("opencv4")
config.substitutions.append( ('%opencv4_flags', pkg_opencv4.stdout.decode('utf-8')[:-1]) )
for env in required_env:
if env not in os.environ:
lit_config.note("missing environnement variable: {}".format(env))
Expand All @@ -258,13 +266,25 @@ def getDeviceCount(device_type):
if xocc == "only":
config.name = 'SYCL-XOCC'
config.test_source_root = config.test_source_root + "/xocc_tests"
run_if_hw="echo"
run_if_hw_emu="echo"
run_if_sw_emu="echo"
if xocc_target == "hw":
timeout = 10800 # 3h
run_if_hw=""
if xocc_target == "hw_emu":
timeout = 1800 # 30min
run_if_hw_emu=""
if xocc_target == "sw_emu":
run_if_sw_emu=""
config.substitutions.append( ('%run_if_hw', run_if_hw) )
config.substitutions.append( ('%run_if_hw_emu', run_if_hw_emu) )
config.substitutions.append( ('%run_if_sw_emu', run_if_sw_emu) )


# Set timeout for test = 10 mins
try:
import psutil
lit_config.maxIndividualTestTime = 600
if "XCL_EMULATION_MODE" in os.environ:
if os.environ["XCL_EMULATION_MODE"] == "hw":
lit_config.maxIndividualTestTime = 10800 # 3h
lit_config.maxIndividualTestTime = timeout
except ImportError:
pass
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
// RUN: true
// REQUIRES: xocc && opencv4

// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%sycl_triple -o %t.out %s %opencv4_flags
// RUN: %run_if_hw %ACC_RUN_PLACEHOLDER %t.out %S/data/input/eiffel.bmp
// RUN: %run_if_hw %ACC_RUN_PLACEHOLDER %t.out %S/data/input/lola.bmp
// RUN: %run_if_hw %ACC_RUN_PLACEHOLDER %t.out %S/data/input/vase.bmp
// RUN: %run_if_sw_emu %ACC_RUN_PLACEHOLDER %t.out %S/data/input/eiffel.bmp
// RUN: %run_if_sw_emu %ACC_RUN_PLACEHOLDER %t.out %S/data/input/lola.bmp
// RUN: %run_if_sw_emu %ACC_RUN_PLACEHOLDER %t.out %S/data/input/vase.bmp

/*
Attempt at translating SDAccel Examples edge_detection example to SYCL
Expand Down Expand Up @@ -29,7 +37,7 @@
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>

#include "../../utilities/device_selectors.hpp"
#include "../utilities/device_selectors.hpp"

using namespace cl::sycl;

Expand All @@ -50,9 +58,9 @@ int main(int argc, char* argv[]) {
// using fixed constexpr values stays more true to the original implementation
// however you can in theory just use input.rows/cols to support a wider range
// of image sizes.
constexpr auto height = 1895; // input.rows;
constexpr auto width = 1024; // input.cols;
constexpr auto area = height * width;
const size_t height = input.rows;
const size_t width = input.cols;
const size_t area = height * width;

selector_defines::CompiledForDeviceSelector selector;
queue q {selector, property::queue::enable_profiling()};
Expand All @@ -71,6 +79,7 @@ int main(int argc, char* argv[]) {
std::cout << "Max Energy = " << ceil(log2((long long)iMax * 2 * 3 * 3)) + 1
<< " Bits \n";
std::cout << "Image Dimensions: " << input.size() << "\n";
std::cout << "Used Size: " << height << "x" << width << " = " << area << "\n";

// mapping the enqueueTask call to a single_task, interested in seeing if a
// parallel_for without a fixed 1-1-1 mapping is workable on an FPGA though..
Expand All @@ -92,10 +101,10 @@ int main(int argc, char* argv[]) {
// to parallel_fors with local sizes
[=]() {
auto gX = xilinx::partition_array<char, 9,
xilinx::partition::complete<0>>({-1, 0, 1, -2, 0, 2, -1, 0, 1});
xilinx::partition::complete<1>>({-1, 0, 1, -2, 0, 2, -1, 0, 1});

auto gY = xilinx::partition_array<char, 9,
xilinx::partition::complete<0>>({1, 2, 1, 0, 0, 0, -1, -2, -1});
xilinx::partition::complete<1>>({1, 2, 1, 0, 0, 0, -1, -2, -1});

int magX, magY, gI, pIndex;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ int main(int argc, char* argv[]) {

cgh.single_task<xilinx::reqd_work_group_size<1, 1, 1, krnl_sobel>>([=] {
xilinx::partition_array
<char, 9,xilinx::partition::complete<0>> gX
<char, 9,xilinx::partition::complete<1>> gX
{ {-1, 0, 1, -2, 0, 2, -1, 0, 1} };
xilinx::partition_array
<char, 9,xilinx::partition::complete<0>> gY
<char, 9,xilinx::partition::complete<1>> gY
{ {1, 2, 1, 0, 0, 0, -1, -2, -1} };

for (size_t x = 1; x < width - 1; ++x) {
Expand Down
2 changes: 1 addition & 1 deletion sycl/tools/sycl-xocc/bin/sycl-xocc
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ if [[ ! ${#KERNEL_NAME_ARRAY[@]} -eq 0 ]]; then
--save-temps -o "$OUTPUT_FILE_NAME" $LINKER_LIST $@

cd $CWD
cp -r $LINKER_WD/* .
cp -rf $LINKER_WD/* .

# This step is unconnected to the generation of the final xcl binary, it dumps
# useful debug information from the final generated xcl binary into the temp
Expand Down

0 comments on commit 22603c6

Please sign in to comment.