Skip to content

Commit

Permalink
New unit test framework for operators (#91)
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang committed Aug 16, 2023
1 parent 2d455d7 commit c1a73b9
Show file tree
Hide file tree
Showing 26 changed files with 1,560 additions and 1,138 deletions.
33 changes: 33 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"configurations": [
{
"name": "ops_matmul_test",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceFolder}/build/ark/ops_matmul_test.cu",
"args": [],
"stopAtEntry": false,
"cwd": "${fileDirname}",
"environment": [
{
"name": "ARK_ROOT",
"value": "${workspaceFolder}/build"
}
],
"externalConsole": false,
"MIMode": "gdb",
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",
"text": "-enable-pretty-printing",
"ignoreFailures": true
},
{
"description": "Set Disassembly Flavor to Intel",
"text": "-gdb-set disassembly-flavor intel",
"ignoreFailures": true
}
]
}
]
}
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,7 @@
"cmake.environment": {
"ARK_ROOT": "${workspaceFolder}/build"
},
"cmake.ctestArgs": [
"--verbose"
],
}
19 changes: 17 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ set(ARK_SOVERSION "${ARK_MAJOR}.${ARK_MINOR}")
option(USE_KAHYPAR "Use KaHyPar for scheduling" OFF)

cmake_minimum_required(VERSION 3.25)
project(ark LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 14)
project(ark LANGUAGES CXX CUDA)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra")
set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})

# Find ibverbs
Expand All @@ -24,6 +26,19 @@ include(${PROJECT_SOURCE_DIR}/cmake/FindNUMA.cmake)

# Find CUDAToolkit
find_package(CUDAToolkit REQUIRED)
if(CUDAToolkit_FOUND)
    # Refuse to configure against toolkits older than CUDA 11.
    if(CUDAToolkit_VERSION_MAJOR LESS 11)
        message(FATAL_ERROR "CUDA 11 or higher is required but detected ${CUDAToolkit_VERSION}")
    endif()

    # Past the check above the toolkit is always CUDA 11 or newer, so
    # architectures 70 and 80 are built unconditionally.
    set(CMAKE_CUDA_ARCHITECTURES 70 80)

    # CUDA 12 and newer additionally build for architecture 90. Append only
    # the new entry so that 70 and 80 are not listed twice.
    if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12)
        list(APPEND CMAKE_CUDA_ARCHITECTURES 90)
    endif()
endif()

# Third party libraries
add_subdirectory(third_party)
Expand Down
4 changes: 2 additions & 2 deletions ark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Licensed under the MIT license.

file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc)
file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cc)
file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cc *_test.cu)
file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cc)
list(REMOVE_ITEM SOURCES ${UT_SOURCES} ${UT_COMMON_SOURCES})
file(GLOB_RECURSE INTERFACE_HEADERS CONFIGURE_DEPENDS include/ark*.h)
Expand Down Expand Up @@ -61,7 +61,7 @@ foreach(ut_source IN ITEMS ${UT_SOURCES})
add_executable(${exe_name} ${ut_source} ${UT_COMMON_SOURCES})
add_dependencies(${exe_name} build)
set_target_properties(${exe_name} PROPERTIES EXCLUDE_FROM_ALL TRUE)
target_link_libraries(${exe_name} PRIVATE ark_obj ${COMMON_LIBS})
target_link_libraries(${exe_name} PRIVATE ark_obj ${COMMON_LIBS} CUDA::cudart CUDA::cublas)
target_include_directories(${exe_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(${exe_name} SYSTEM PRIVATE
${PROJECT_SOURCE_DIR}/third_party/json
Expand Down
16 changes: 1 addition & 15 deletions ark/gpu/gpu_compile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "gpu/gpu_compile.h"
#include "gpu/gpu_logging.h"
#include "include/ark.h"
#include "random.h"
#include "threading.h"

#define ARK_USE_NVRTC 0
Expand All @@ -27,21 +28,6 @@

using namespace std;

// Build a string of `len` characters, each one picked with rand() from the
// decimal digits and the upper- and lower-case ASCII letters.
static const std::string rand_anum(size_t len)
{
    static const char kAlphabet[] = "0123456789"
                                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                    "abcdefghijklmnopqrstuvwxyz";
    // sizeof counts the literal's terminating NUL; keep it out of the
    // range that can be picked.
    const size_t alphabet_len = sizeof(kAlphabet) - 1;

    std::string result;
    result.reserve(len);
    // One rand() call per output character, in order.
    for (size_t pos = 0; pos < len; ++pos) {
        result.push_back(kAlphabet[rand() % alphabet_len]);
    }
    return result;
}

namespace ark {

#if (ARK_USE_NVRTC)
Expand Down
52 changes: 14 additions & 38 deletions ark/gpu/gpu_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,46 +86,40 @@ void GpuKernel::compile(const GpuInfo &gpu_info, bool use_comm_sw)
this->cubin =
gpu_compile(this->codes, gpu_info.arch, max_reg_cnt, use_comm_sw);
}
}

//
void GpuKernel::load()
{
//
unsigned int buflen = 8192;
char *infobuf = new char[buflen];
char *errbuf = new char[buflen];
assert(infobuf != nullptr);
assert(errbuf != nullptr);
size_t num_opts = 5;
size_t buflen = 8192;
std::unique_ptr<CUjit_option[]> opts(new CUjit_option[num_opts]);
std::unique_ptr<void *[]> optvals(new void *[num_opts]);
std::string infobuf;
std::string errbuf;

infobuf.resize(buflen, ' ');
errbuf.resize(buflen, ' ');

int enable = 1;
int num_opts = 5;
CUjit_option *opts = new CUjit_option[num_opts];
void **optvals = new void *[num_opts];
assert(opts != nullptr);
assert(optvals != nullptr);

opts[0] = CU_JIT_INFO_LOG_BUFFER;
optvals[0] = (void *)infobuf;
optvals[0] = (void *)infobuf.data();

opts[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
optvals[1] = (void *)(long)buflen;

opts[2] = CU_JIT_ERROR_LOG_BUFFER;
optvals[2] = (void *)errbuf;
optvals[2] = (void *)errbuf.data();

opts[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
optvals[3] = (void *)(long)buflen;

opts[4] = CU_JIT_GENERATE_DEBUG_INFO;
optvals[4] = (void *)(long)enable;

if (cuModuleLoadDataEx(&this->module, this->cubin.c_str(), num_opts, opts,
optvals) != CUDA_SUCCESS) {
if (cuModuleLoadDataEx(&this->module, this->cubin.c_str(), num_opts,
opts.get(), optvals.get()) != CUDA_SUCCESS) {
LOG(DEBUG, infobuf);
LOG(ERROR, "cuModuleLoadDataEx() failed: ", errbuf);
}
delete[] infobuf;
delete[] errbuf;
CULOG(cuModuleGetFunction(&this->kernel, this->module, this->name.c_str()));
//
int static_smem_size_bytes;
Expand All @@ -136,8 +130,6 @@ void GpuKernel::load()
CULOG(cuFuncSetAttribute(this->kernel,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
dynamic_smem_size_bytes));
// Now code string is not needed.
// this->code.clear();
}

//
Expand Down Expand Up @@ -265,7 +257,6 @@ void GpuLoopKernel::compile(const GpuInfo &gpu_info)
void GpuLoopKernel::load()
{
this->ctx->set_current();
GpuKernel::load();
//
if (!this->is_compiled()) {
LOG(ERROR, "Need to compile first before initialization.");
Expand Down Expand Up @@ -368,25 +359,10 @@ GpuState GpuLoopKernel::launch(CUstream stream, bool disable_timing)
// Publish `iter` through the flag that `flag_href` points to.
//
// Does nothing when `iter` is zero or negative. Otherwise it spins until the
// flag's current value is no longer positive and then stores `iter` into it.
void GpuLoopKernel::run(int iter)
{
    if (iter <= 0) {
        return;
    }
    // The volatile qualifier forces every loop iteration to re-read the flag.
    volatile int *flag = this->flag_href;
    while (*flag > 0) {
        // Busy-wait until the previously stored value has been cleared.
    }
    *flag = iter;
}

Expand Down
1 change: 0 additions & 1 deletion ark/gpu/gpu_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ class GpuKernel
~GpuKernel();

void compile(const GpuInfo &gpu_info, bool use_comm_sw = true);
void load();
GpuState launch(GpuStream stream);

const std::string &get_name()
Expand Down
16 changes: 14 additions & 2 deletions ark/include/ark.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,21 @@ class Tensor
/// After read, the data in the host buffer will be 0, 1, 2, 4, 5, 6.
///
/// @param buf The host buffer to copy to. The buffer must be large enough
/// to hold the data.
/// to hold the data. If @p buf is nullptr, a new buffer will be allocated.
/// @return The host buffer that holds the data.
///
void *read(void *buf = nullptr);

/// Copy all the underlying buffer data (including padding) to a contiguous
/// host buffer.
///
/// This function is mainly for debugging purposes.
///
/// @param buf The host buffer to copy to. The buffer must be large enough
/// to hold the data. If @p buf is nullptr, a new buffer will be allocated.
/// @return The host buffer that holds the data.
///
void read(void *buf);
void *read_raw(void *buf = nullptr);

/// Set all bytes of the tensor buffer to 0.
void clear();
Expand Down
28 changes: 2 additions & 26 deletions ark/include/ark_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ template <typename T> std::unique_ptr<T[]> rand_array(size_t num, float max_val)
std::unique_ptr<half_t[]> rand_halfs(size_t num, float max_val);
// Return a random float array.
std::unique_ptr<float[]> rand_floats(size_t num, float max_val);
// Return a random byte array.
std::unique_ptr<uint8_t[]> rand_bytes(size_t num);

// Return a half_t range array.
std::unique_ptr<half_t[]> range_halfs(size_t num, float begin = 1.0f,
Expand Down Expand Up @@ -79,32 +81,6 @@ template <typename T> std::unique_ptr<T[]> ones(size_t num)
return std::unique_ptr<T[]>(ret);
}

// Return the error rate between two values.
float error_rate(half_t a, half_t b);
float error_rate(float a, float b);

// Return mean squared error and max error rate between two matrices.
std::pair<float, float> cmp_matrix(half_t *ground_truth, half_t *res,
unsigned int m, unsigned int n,
unsigned int bs = 1, unsigned int lm = 0,
unsigned int ln = 0, bool print = false);
std::pair<float, float> cmp_matrix(float *ground_truth, float *res,
unsigned int m, unsigned int n,
unsigned int bs = 1, unsigned int lm = 0,
unsigned int ln = 0, bool print = false);

// Print a matrix.
void print_matrix(half_t *val, unsigned int m, unsigned int n, unsigned int bs,
unsigned int lm, unsigned int ln);
void print_matrix(float *val, unsigned int m, unsigned int n, unsigned int bs,
unsigned int lm, unsigned int ln);

//
std::pair<float, float> tensor_compare(half_t *ground_truth, half_t *res,
Dims shape, bool print);
std::pair<float, float> tensor_compare(float *ground_truth, float *res,
Dims shape, bool print);

// Spawn a process that runs `func`. Returns PID of the spawned process.
int proc_spawn(const std::function<int()> &func);
// Wait for a spawned process with PID `pid`.
Expand Down
7 changes: 4 additions & 3 deletions ark/include/kernels/unit_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,10 @@ struct UnitOp
static_assert(_SmemBytes >= 0, "Bytes of shared memory is negative");

// Number of unit operators in each dimension.
using UnitOpDims =
Vec<_OutDims::N / _UnitOutDims::N, _OutDims::C / _UnitOutDims::C,
_OutDims::H / _UnitOutDims::H, _OutDims::W / _UnitOutDims::W>;
using UnitOpDims = Vec<math::div_up<_OutShape::N, _UnitOutDims::N>::value,
math::div_up<_OutShape::C, _UnitOutDims::C>::value,
math::div_up<_OutShape::H, _UnitOutDims::H>::value,
math::div_up<_OutShape::W, _UnitOutDims::W>::value>;

static const int NumThreads = _NumThreads;
static const int SmemBytes = _SmemBytes;
Expand Down
Loading

0 comments on commit c1a73b9

Please sign in to comment.