New unit test framework for operators (#91)

microsoft · Aug 16, 2023 · c1a73b9
1 parent 2d455d7
commit c1a73b9
Show file tree

Hide file tree

Showing 26 changed files with 1,560 additions and 1,138 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,33 @@
+{
+    "configurations": [
+    {
+        "name": "ops_matmul_test",
+        "type": "cppdbg",
+        "request": "launch",
+        "program": "${workspaceFolder}/build/ark/ops_matmul_test.cu",
+        "args": [],
+        "stopAtEntry": false,
+        "cwd": "${fileDirname}",
+        "environment": [
+            {
+                "name": "ARK_ROOT",
+                "value": "${workspaceFolder}/build"
+            }
+        ],
+        "externalConsole": false,
+        "MIMode": "gdb",
+        "setupCommands": [
+            {
+                "description": "Enable pretty-printing for gdb",
+                "text": "-enable-pretty-printing",
+                "ignoreFailures": true
+            },
+            {
+                "description": "Set Disassembly Flavor to Intel",
+                "text": "-gdb-set disassembly-flavor intel",
+                "ignoreFailures": true
+            }
+        ]
+    }
+    ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -3,4 +3,7 @@
     "cmake.environment": {
         "ARK_ROOT": "${workspaceFolder}/build"
     },
+    "cmake.ctestArgs": [
+        "--verbose"
+    ],
 }
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -11,9 +11,11 @@ set(ARK_SOVERSION "${ARK_MAJOR}.${ARK_MINOR}")
 option(USE_KAHYPAR "Use KaHyPar for scheduling" OFF)
 
 cmake_minimum_required(VERSION 3.25)
-project(ark LANGUAGES CXX)
-set(CMAKE_CXX_STANDARD 14)
+project(ark LANGUAGES CXX CUDA)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra")
 set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
 # Find ibverbs
@@ -24,6 +26,19 @@ include(${PROJECT_SOURCE_DIR}/cmake/FindNUMA.cmake)
 
 # Find CUDAToolkit
 find_package(CUDAToolkit REQUIRED)
+if(CUDAToolkit_FOUND)
+    if(CUDAToolkit_VERSION_MAJOR LESS 11)
+        message(FATAL_ERROR "CUDA 11 or higher is required but detected ${CUDAToolkit_VERSION}")
+    endif()
+
+    if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11)
+        set(CMAKE_CUDA_ARCHITECTURES 70 80)
+    endif()
+
+    if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12)
+        set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 70 80 90)
+    endif()
+endif()
 
 # Third party libraries
 add_subdirectory(third_party)

diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc)
-file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cc)
+file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cc *_test.cu)
 file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cc)
 list(REMOVE_ITEM SOURCES ${UT_SOURCES} ${UT_COMMON_SOURCES})
 file(GLOB_RECURSE INTERFACE_HEADERS CONFIGURE_DEPENDS include/ark*.h)
@@ -61,7 +61,7 @@ foreach(ut_source IN ITEMS ${UT_SOURCES})
     add_executable(${exe_name} ${ut_source} ${UT_COMMON_SOURCES})
     add_dependencies(${exe_name} build)
     set_target_properties(${exe_name} PROPERTIES EXCLUDE_FROM_ALL TRUE)
-    target_link_libraries(${exe_name} PRIVATE ark_obj ${COMMON_LIBS})
+    target_link_libraries(${exe_name} PRIVATE ark_obj ${COMMON_LIBS} CUDA::cudart CUDA::cublas)
     target_include_directories(${exe_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
     target_include_directories(${exe_name} SYSTEM PRIVATE
         ${PROJECT_SOURCE_DIR}/third_party/json

diff --git a/ark/gpu/gpu_compile.cc b/ark/gpu/gpu_compile.cc
@@ -16,6 +16,7 @@
 #include "gpu/gpu_compile.h"
 #include "gpu/gpu_logging.h"
 #include "include/ark.h"
+#include "random.h"
 #include "threading.h"
 
 #define ARK_USE_NVRTC 0
@@ -27,21 +28,6 @@
 
 using namespace std;
 
-// Generate a random alpha-numeric string.
-static const string rand_anum(size_t len)
-{
-    auto randchar = []() -> char {
-        const char charset[] = "0123456789"
-                               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-                               "abcdefghijklmnopqrstuvwxyz";
-        const size_t max_index = sizeof(charset) - 1;
-        return charset[rand() % max_index];
-    };
-    string str(len, 0);
-    generate_n(str.begin(), len, randchar);
-    return str;
-}
-
 namespace ark {
 
 #if (ARK_USE_NVRTC)

diff --git a/ark/gpu/gpu_kernel.cc b/ark/gpu/gpu_kernel.cc
@@ -86,46 +86,40 @@ void GpuKernel::compile(const GpuInfo &gpu_info, bool use_comm_sw)
         this->cubin =
             gpu_compile(this->codes, gpu_info.arch, max_reg_cnt, use_comm_sw);
     }
-}
 
-//
-void GpuKernel::load()
-{
     //
-    unsigned int buflen = 8192;
-    char *infobuf = new char[buflen];
-    char *errbuf = new char[buflen];
-    assert(infobuf != nullptr);
-    assert(errbuf != nullptr);
+    size_t num_opts = 5;
+    size_t buflen = 8192;
+    std::unique_ptr<CUjit_option[]> opts(new CUjit_option[num_opts]);
+    std::unique_ptr<void *[]> optvals(new void *[num_opts]);
+    std::string infobuf;
+    std::string errbuf;
+
+    infobuf.resize(buflen, ' ');
+    errbuf.resize(buflen, ' ');
+
     int enable = 1;
-    int num_opts = 5;
-    CUjit_option *opts = new CUjit_option[num_opts];
-    void **optvals = new void *[num_opts];
-    assert(opts != nullptr);
-    assert(optvals != nullptr);
 
     opts[0] = CU_JIT_INFO_LOG_BUFFER;
-    optvals[0] = (void *)infobuf;
+    optvals[0] = (void *)infobuf.data();
 
     opts[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
     optvals[1] = (void *)(long)buflen;
 
     opts[2] = CU_JIT_ERROR_LOG_BUFFER;
-    optvals[2] = (void *)errbuf;
+    optvals[2] = (void *)errbuf.data();
 
     opts[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
     optvals[3] = (void *)(long)buflen;
 
     opts[4] = CU_JIT_GENERATE_DEBUG_INFO;
     optvals[4] = (void *)(long)enable;
 
-    if (cuModuleLoadDataEx(&this->module, this->cubin.c_str(), num_opts, opts,
-                           optvals) != CUDA_SUCCESS) {
+    if (cuModuleLoadDataEx(&this->module, this->cubin.c_str(), num_opts,
+                           opts.get(), optvals.get()) != CUDA_SUCCESS) {
         LOG(DEBUG, infobuf);
         LOG(ERROR, "cuModuleLoadDataEx() failed: ", errbuf);
     }
-    delete[] infobuf;
-    delete[] errbuf;
     CULOG(cuModuleGetFunction(&this->kernel, this->module, this->name.c_str()));
     //
     int static_smem_size_bytes;
@@ -136,8 +130,6 @@ void GpuKernel::load()
     CULOG(cuFuncSetAttribute(this->kernel,
                              CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                              dynamic_smem_size_bytes));
-    // Now code string is not needed.
-    // this->code.clear();
 }
 
 //
@@ -265,7 +257,6 @@ void GpuLoopKernel::compile(const GpuInfo &gpu_info)
 void GpuLoopKernel::load()
 {
     this->ctx->set_current();
-    GpuKernel::load();
     //
     if (!this->is_compiled()) {
         LOG(ERROR, "Need to compile first before initialization.");
@@ -368,25 +359,10 @@ GpuState GpuLoopKernel::launch(CUstream stream, bool disable_timing)
 void GpuLoopKernel::run(int iter)
 {
     if (iter > 0) {
-#if 0
-        int idx = this->flip_flag ? 0 : 1;
-        int rem = iter;
-        while (rem--) {
-            while (this->get_flag(idx) > 0) {
-                cpu_ntimer_sleep(500);
-            }
-            this->set_flag(idx, 1);
-            idx ^= 1;
-        }
-        if (iter & 1) {
-            this->flip_flag = !(this->flip_flag);
-        }
-#else
         volatile int *href = this->flag_href;
         while (*href > 0) {
         }
         *href = iter;
-#endif
     }
 }
 

diff --git a/ark/gpu/gpu_kernel.h b/ark/gpu/gpu_kernel.h
@@ -33,7 +33,6 @@ class GpuKernel
     ~GpuKernel();
 
     void compile(const GpuInfo &gpu_info, bool use_comm_sw = true);
-    void load();
     GpuState launch(GpuStream stream);
 
     const std::string &get_name()

diff --git a/ark/include/ark.h b/ark/include/ark.h
@@ -171,9 +171,21 @@ class Tensor
     /// After read, the data in the host buffer will be 0, 1, 2, 4, 5, 6.
     ///
     /// @param buf The host buffer to copy to. The buffer must be large enough
-    /// to hold the data.
+    /// to hold the data. If @p buf is nullptr, a new buffer will be allocated.
+    /// @return The host buffer that holds the data.
+    ///
+    void *read(void *buf = nullptr);
+
+    /// Copy all the underlying buffer data (including padding) to a contiguous
+    /// host buffer.
+    ///
+    /// This function is mainly for debugging purposes.
+    ///
+    /// @param buf The host buffer to copy to. The buffer must be large enough
+    /// to hold the data. If @p buf is nullptr, a new buffer will be allocated.
+    /// @return The host buffer that holds the data.
     ///
-    void read(void *buf);
+    void *read_raw(void *buf = nullptr);
 
     /// Set all bytes of the tensor buffer to 0.
     void clear();

diff --git a/ark/include/ark_utils.h b/ark/include/ark_utils.h
@@ -51,6 +51,8 @@ template <typename T> std::unique_ptr<T[]> rand_array(size_t num, float max_val)
 std::unique_ptr<half_t[]> rand_halfs(size_t num, float max_val);
 // Return a random float array.
 std::unique_ptr<float[]> rand_floats(size_t num, float max_val);
+// Return a random bytes array.
+std::unique_ptr<uint8_t[]> rand_bytes(size_t num);
 
 // Return a half_t range array.
 std::unique_ptr<half_t[]> range_halfs(size_t num, float begin = 1.0f,
@@ -79,32 +81,6 @@ template <typename T> std::unique_ptr<T[]> ones(size_t num)
     return std::unique_ptr<T[]>(ret);
 }
 
-// Return the error rate between two values.
-float error_rate(half_t a, half_t b);
-float error_rate(float a, float b);
-
-// Return mean squared error and max error rate between two matrices.
-std::pair<float, float> cmp_matrix(half_t *ground_truth, half_t *res,
-                                   unsigned int m, unsigned int n,
-                                   unsigned int bs = 1, unsigned int lm = 0,
-                                   unsigned int ln = 0, bool print = false);
-std::pair<float, float> cmp_matrix(float *ground_truth, float *res,
-                                   unsigned int m, unsigned int n,
-                                   unsigned int bs = 1, unsigned int lm = 0,
-                                   unsigned int ln = 0, bool print = false);
-
-// Print a matrix.
-void print_matrix(half_t *val, unsigned int m, unsigned int n, unsigned int bs,
-                  unsigned int lm, unsigned int ln);
-void print_matrix(float *val, unsigned int m, unsigned int n, unsigned int bs,
-                  unsigned int lm, unsigned int ln);
-
-//
-std::pair<float, float> tensor_compare(half_t *ground_truth, half_t *res,
-                                       Dims shape, bool print);
-std::pair<float, float> tensor_compare(float *ground_truth, float *res,
-                                       Dims shape, bool print);
-
 // Spawn a process that runs `func`. Returns PID of the spawned process.
 int proc_spawn(const std::function<int()> &func);
 // Wait for a spawned process with PID `pid`.

diff --git a/ark/include/kernels/unit_op.h b/ark/include/kernels/unit_op.h
@@ -65,9 +65,10 @@ struct UnitOp
     static_assert(_SmemBytes >= 0, "Bytes of shared memory is negative");
 
     // Number of unit operators in each dimension.
-    using UnitOpDims =
-        Vec<_OutDims::N / _UnitOutDims::N, _OutDims::C / _UnitOutDims::C,
-            _OutDims::H / _UnitOutDims::H, _OutDims::W / _UnitOutDims::W>;
+    using UnitOpDims = Vec<math::div_up<_OutShape::N, _UnitOutDims::N>::value,
+                           math::div_up<_OutShape::C, _UnitOutDims::C>::value,
+                           math::div_up<_OutShape::H, _UnitOutDims::H>::value,
+                           math::div_up<_OutShape::W, _UnitOutDims::W>::value>;
 
     static const int NumThreads = _NumThreads;
     static const int SmemBytes = _SmemBytes;