From 3096adebae17dd17e2c88b3c051dcc1ad9e10ebf Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 15 Feb 2017 00:18:10 +0100 Subject: [PATCH 01/14] Add initial R stub --- src/CMakeLists.txt | 16 ++++++++++--- src/cmake/FindR.cmake | 55 +++++++++++++++++++++++++++++++++++++++++++ src/r.cc | 24 +++++++++++++++++++ 3 files changed, 92 insertions(+), 3 deletions(-) create mode 100644 src/cmake/FindR.cmake create mode 100644 src/r.cc diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 319319f..77a392a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,6 @@ cmake_minimum_required(VERSION 3.2) project(KMCUDA) +set(CMAKE_MODULE_PATH ${CMAKE_HOME_DIRECTORY}/cmake) find_package(OpenMP REQUIRED) if (APPLE AND NOT CUDA_HOST_COMPILER) # https://gitlab.kitware.com/cmake/cmake/issues/13674 @@ -24,7 +25,9 @@ if (NOT DISABLE_PYTHON) execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy; print(numpy.get_include())" OUTPUT_VARIABLE NUMPY_INCLUDES) endif() endif() - +if (NOT DISABLE_R) + find_package(R) +endif() if (PROFILE OR CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPROFILE") endif() @@ -35,9 +38,12 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}") set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h tricks.cuh metric_abstraction.h kmeans.cu knn.cu transpose.cu) -if (NOT DISABLE_PYTHON) +if (PYTHONLIBS_FOUND) list(APPEND SOURCE_FILES python.cc) endif() +if (R_FOUND) + list(APPEND SOURCE_FILES r.cc) +endif() if (CMAKE_BUILD_TYPE STREQUAL "Debug") set(NVCC_FLAGS "-G -g") endif() @@ -59,10 +65,14 @@ if (APPLE) set(CMAKE_SHARED_LIBRARY_CXX_FLAGS "${CMAKE_SHARED_LIBRARY_CXX_FLAGS_BACKUP}") endif() target_link_libraries(KMCUDA ${CUDA_curand_LIBRARY}) -if(PYTHONLIBS_FOUND) +if (PYTHONLIBS_FOUND) include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDES}) target_link_libraries(KMCUDA ${PYTHON_LIBRARIES}) endif() +if (R_FOUND) + include_directories(${R_INCLUDE_DIRS}) + target_link_libraries(KMCUDA ${R_LIBRARIES}) +endif() if (SUFFIX) set_target_properties(KMCUDA PROPERTIES SUFFIX ${SUFFIX}) endif() diff --git a/src/cmake/FindR.cmake b/src/cmake/FindR.cmake new file mode 100644 index 0000000..2802671 --- /dev/null +++ b/src/cmake/FindR.cmake @@ -0,0 +1,55 @@ +# CMake module to find R +# - Try to find R +# Once done, this will define +# +# R_FOUND - system has R +# R_INCLUDE_DIRS - the R include directories +# R_LIBRARIES - link these to use R +# R_ROOT_DIR - As reported by R +# Autor: Omar Andres Zapata Mesa 31/05/2013 +if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set(CMAKE_FIND_APPBUNDLE "LAST") +endif() +find_program(R_EXECUTABLE NAMES R R.exe) +#---searching R installtion unsing R executable +if(R_EXECUTABLE) + execute_process(COMMAND ${R_EXECUTABLE} RHOME + OUTPUT_VARIABLE R_ROOT_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE) + find_path(R_INCLUDE_DIR R.h + HINTS ${R_ROOT_DIR} + PATHS /usr/local/lib /usr/local/lib64 /usr/share + PATH_SUFFIXES include R/include + DOC "Path to file R.h") + find_library(R_LIBRARY R + HINTS ${R_ROOT_DIR}/lib + DOC "R library (example libR.a, libR.dylib, etc.).") +endif() +#---setting include dirs and libraries +set(R_LIBRARIES ${R_LIBRARY}) +set(R_INCLUDE_DIRS ${R_INCLUDE_DIR}) +foreach(_cpt ${R_FIND_COMPONENTS}) + execute_process(COMMAND echo "cat(find.package('${_cpt}'))" + COMMAND ${R_EXECUTABLE} --vanilla --slave + OUTPUT_VARIABLE _cpt_path + OUTPUT_STRIP_TRAILING_WHITESPACE) + find_library(R_${_cpt}_LIBRARY + lib${_cpt}.so lib${_cpt}.dylib + HINTS ${_cpt_path}/lib) + if(R_${_cpt}_LIBRARY) + mark_as_advanced(R_${_cpt}_LIBRARY) + list(APPEND R_LIBRARIES ${R_${_cpt}_LIBRARY}) + endif() + find_path(R_${_cpt}_INCLUDE_DIR ${_cpt}.h HINTS ${_cpt_path} PATH_SUFFIXES include R/include) + if(R_${_cpt}_INCLUDE_DIR) + mark_as_advanced(R_${_cpt}_INCLUDE_DIR) + list(APPEND R_INCLUDE_DIRS ${R_${_cpt}_INCLUDE_DIR}) + endif() + if(R_${_cpt}_INCLUDE_DIR AND R_${_cpt}_LIBRARY) + list(REMOVE_ITEM R_FIND_COMPONENTS ${_cpt}) + endif() +endforeach() +# Handle the QUIETLY and REQUIRED arguments and set R_FOUND to TRUE if all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(R DEFAULT_MSG R_EXECUTABLE R_INCLUDE_DIR R_LIBRARY) +mark_as_advanced(R_FOUND R_EXECUTABLE R_INCLUDE_DIR R_LIBRARY) diff --git a/src/r.cc b/src/r.cc new file mode 100644 index 0000000..21936bb --- /dev/null +++ b/src/r.cc @@ -0,0 +1,24 @@ +#include +#include +#include +#include "kmcuda.h" + +extern "C" { + +static SEXP r_kmeans_cuda(SEXP args); + +static R_ExternalMethodDef externalMethods[] = { + {"kmeans_cuda", (DL_FUNC) &r_kmeans_cuda, -1}, + {NULL, NULL, 0} +}; + +void R_init_libKMCUDA(DllInfo *info) { + R_registerRoutines(info, NULL, NULL, NULL, externalMethods); +} + +static SEXP r_kmeans_cuda(SEXP args) { + Rprintf("%d arguments\n", length(args)); + return R_NilValue; +} + +} // extern "C" From 25abc4554ceec8a476412a3b8164b1fc47fea4bc Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 15 Feb 2017 09:59:38 +0100 Subject: [PATCH 02/14] Add second benchmark --- README.md | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a2ac5b9..74167bd 100644 --- a/README.md +++ b/README.md @@ -33,11 +33,14 @@ Table of contents * [macOS](#macos) * [Testing](#testing) * [Benchmarks](#benchmarks) - * [100000x256@1024](#100000x2561024) + * [100,000x256@1024](#100000x2561024) * [Configuration](#configuration) * [Contestants](#contestants) * [Data](#data) * [Notes](#notes-1) + * [8,000,000x256@1024](#8000000x2561024) + * [Data](#data-1) + * [Notes](#notes-2) * [Python examples](#python-examples) * [K-means, L2 (Euclidean) distance](#k-means-l2-euclidean-distance) * [K-means, angular (cosine) distance average](#k-means-angular-cosine-distance--average) @@ -167,8 +170,6 @@ Benchmarks ---------- ### 100000x256@1024 -Comparison of some KMeans implementations: - | | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda | kmcuda 2 GPU | |------------|----------------|-----------|------------------|--------|--------|--------------| | time, s | 164 | 36 | 20 | 10.6 | 9.2 | 5.5 | @@ -193,6 +194,21 @@ Comparison of some KMeans implementations: #### Notes 100000 is the maximum size Serban KMeans can handle. +### 8000000x256@1024 +| | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda 2 GPU | kmcuda Yinyang 2 GPU | +|------------|----------------|-----------|------------------|--------|--------------|----------------------| +| time | please no | - | 6h 34m | fail | 44m | 37m | +| memory, GB | - | - | 205 | fail | 8.7 | 10.4 | + +kmeans++ initialization, 93 iterations (1% reassignments equivalent). + +#### Data +8,000,000 secret production samples. + +#### Notes +KmeansRex did eat 205 GB of RAM on peak; it uses dynamic memory so it constantly +bounced from 100 GB to 200 GB. + Python examples --------------- From bfae3b06ec3259850aca5c26aae9ab35d10db36f Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 15 Feb 2017 12:16:05 +0100 Subject: [PATCH 03/14] Optimize bounds access in Yinyang --- README.md | 2 +- src/kmeans.cu | 49 ++++++++++++++++++++++++------------------------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 74167bd..c87aa10 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,7 @@ Benchmarks ### 8000000x256@1024 | | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda 2 GPU | kmcuda Yinyang 2 GPU | |------------|----------------|-----------|------------------|--------|--------------|----------------------| -| time | please no | - | 6h 34m | fail | 44m | 37m | +| time | please no | - | 6h 34m | fail | 44m | 36m | | memory, GB | - | - | 205 | fail | 8.7 | 10.4 | kmeans++ initialization, 93 iterations (1% reassignments equivalent). diff --git a/src/kmeans.cu b/src/kmeans.cu index 2a3cbf4..f509878 100644 --- a/src/kmeans.cu +++ b/src/kmeans.cu @@ -437,11 +437,9 @@ __global__ void kmeans_yy_init( if (sample >= length) { return; } - bounds += static_cast(sample) * (d_yy_groups_size + 1); for (uint32_t i = 0; i < d_yy_groups_size + 1; i++) { - bounds[i] = FLT_MAX; + bounds[static_cast(length) * i + sample] = FLT_MAX; } - bounds++; uint32_t nearest = assignments[sample]; extern __shared__ float shared_memory[]; F *volatile shared_centroids = reinterpret_cast(shared_memory); @@ -475,11 +473,12 @@ __global__ void kmeans_yy_init( samples, shared_centroids + (c - gc) * d_features_size, d_samples_size, sample + offset); if (c != nearest) { - if (dist < bounds[group]) { - bounds[group] = dist; + uint64_t gindex = static_cast(length) * (1 + group) + sample; + if (dist < bounds[gindex]) { + bounds[gindex] = dist; } } else { - bounds[-1] = dist; + bounds[sample] = dist; } } } @@ -549,44 +548,42 @@ __global__ void kmeans_yy_global_filter( if (sample >= length) { return; } - bounds += static_cast(sample) * (d_yy_groups_size + 1); uint32_t cluster = assignments[sample]; assignments_prev[sample] = cluster; - float upper_bound = bounds[0]; + float upper_bound = bounds[sample]; uint32_t doffset = d_clusters_size * d_features_size; float cluster_drift = drifts[doffset + cluster]; upper_bound += cluster_drift; - bounds++; float min_lower_bound = FLT_MAX; for (uint32_t g = 0; g < d_yy_groups_size; g++) { - float lower_bound = bounds[g] - drifts[g]; - bounds[g] = lower_bound; + uint64_t gindex = static_cast(length) * (1 + g) + sample; + float lower_bound = bounds[gindex] - drifts[g]; + bounds[gindex] = lower_bound; if (lower_bound < min_lower_bound) { min_lower_bound = lower_bound; } } - bounds--; // group filter try #1 if (min_lower_bound >= upper_bound) { - bounds[0] = upper_bound; + bounds[sample] = upper_bound; return; } upper_bound = 0; upper_bound = METRIC::distance_t( samples, centroids + cluster * d_features_size, d_samples_size, sample + offset); - bounds[0] = upper_bound; + bounds[sample] = upper_bound; // group filter try #2 if (min_lower_bound >= upper_bound) { return; } - // D'oh! + // d'oh! passed[atomicAggInc(&d_passed_number)] = sample; } template __global__ void kmeans_yy_local_filter( - const uint32_t offset, const F *__restrict__ samples, + const uint32_t offset, const uint32_t length, const F *__restrict__ samples, const uint32_t *__restrict__ passed, const F *__restrict__ centroids, const uint32_t *__restrict__ groups, const float *__restrict__ drifts, uint32_t *__restrict__ assignments, float *__restrict__ bounds) { @@ -595,9 +592,7 @@ __global__ void kmeans_yy_local_filter( return; } sample = passed[sample]; - bounds += static_cast(sample) * (d_yy_groups_size + 1); - float upper_bound = bounds[0]; - bounds++; + float upper_bound = bounds[sample]; uint32_t cluster = assignments[sample]; uint32_t doffset = d_clusters_size * d_features_size; float min_dist = upper_bound, second_min_dist = FLT_MAX; @@ -633,7 +628,8 @@ __global__ void kmeans_yy_local_filter( // this may happen if the centroid is insane (NaN) continue; } - float lower_bound = bounds[group]; + float lower_bound = bounds[ + static_cast(length) * (1 + group) + sample]; if (lower_bound >= upper_bound) { if (lower_bound < second_min_dist) { second_min_dist = lower_bound; @@ -658,14 +654,17 @@ __global__ void kmeans_yy_local_filter( } uint32_t nearest_group = groups[nearest]; uint32_t previous_group = groups[cluster]; - bounds[nearest_group] = second_min_dist; + bounds[static_cast(length) * (1 + nearest_group) + sample] = + second_min_dist; if (nearest_group != previous_group) { - float pb = bounds[previous_group]; + uint64_t gindex = + static_cast(length) * (1 + previous_group) + sample; + float pb = bounds[gindex]; if (pb > upper_bound) { - bounds[previous_group] = upper_bound; + bounds[gindex] = upper_bound; } } - bounds[-1] = min_dist; + bounds[sample] = min_dist; if (cluster != nearest) { assignments[sample] = nearest; atomicAggInc(&d_changed_number); @@ -1244,7 +1243,7 @@ KMCUDAResult kmeans_cuda_yy( (*bounds_yy)[devi].get(), (*passed_yy)[devi].get())); dim3 slgrid(upper(length, slblock.x), 1, 1); KERNEL_SWITCH(kmeans_yy_local_filter, <<>>( - offset, reinterpret_cast(samples[devi].get()), + offset, length, reinterpret_cast(samples[devi].get()), (*passed_yy)[devi].get(), reinterpret_cast((*centroids)[devi].get()), (*assignments_yy)[devi].get(), (*drifts_yy)[devi].get(), (*assignments)[devi].get() + offset, (*bounds_yy)[devi].get())); From 865e0bcad26863876ef45be6a0182f12c2ae8208 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 15 Feb 2017 10:05:28 +0100 Subject: [PATCH 04/14] Add initial R implementation --- src/kmcuda.h | 34 ++++++++ src/python.cc | 24 +----- src/r.cc | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 262 insertions(+), 20 deletions(-) diff --git a/src/kmcuda.h b/src/kmcuda.h index 61ca97b..0e1d4ae 100644 --- a/src/kmcuda.h +++ b/src/kmcuda.h @@ -102,4 +102,38 @@ KMCUDAResult knn_cuda( } // extern "C" #endif +#ifdef __cplusplus +#include + +namespace { +namespace kmcuda { +const std::unordered_map init_methods { + {"kmeans++", kmcudaInitMethodPlusPlus}, + {"k-means++", kmcudaInitMethodPlusPlus}, + {"afkmc2", kmcudaInitMethodAFKMC2}, + {"afk-mc2", kmcudaInitMethodAFKMC2}, + {"random", kmcudaInitMethodRandom} +}; + +const std::unordered_map metrics { + {"euclidean", kmcudaDistanceMetricL2}, + {"L2", kmcudaDistanceMetricL2}, + {"l2", kmcudaDistanceMetricL2}, + {"cos", kmcudaDistanceMetricCosine}, + {"cosine", kmcudaDistanceMetricCosine}, + {"angular", kmcudaDistanceMetricCosine} +}; + +const std::unordered_map statuses { + {kmcudaSuccess, "Success"}, + {kmcudaInvalidArguments, "InvalidArguments"}, + {kmcudaNoSuchDevice, "NoSuchDevice"}, + {kmcudaMemoryAllocationFailure, "MemoryAllocationFailure"}, + {kmcudaRuntimeError, "RuntimeError"}, + {kmcudaMemoryCopyError, "MemoryCopyError"} +}; +} // namespace kmcuda +} // namespace +#endif // __cplusplus + #endif //KMCUDA_KMCUDA_H diff --git a/src/python.cc b/src/python.cc index 94ad02a..879d0fa 100644 --- a/src/python.cc +++ b/src/python.cc @@ -68,22 +68,6 @@ class _pyobj : public pyobj_parent { using pyobj = _pyobj; using pyarray = _pyobj; -static const std::unordered_map init_methods { - {"kmeans++", kmcudaInitMethodPlusPlus}, - {"k-means++", kmcudaInitMethodPlusPlus}, - {"afkmc2", kmcudaInitMethodAFKMC2}, - {"random", kmcudaInitMethodRandom} -}; - -static const std::unordered_map metrics { - {"euclidean", kmcudaDistanceMetricL2}, - {"L2", kmcudaDistanceMetricL2}, - {"l2", kmcudaDistanceMetricL2}, - {"cos", kmcudaDistanceMetricCosine}, - {"cosine", kmcudaDistanceMetricCosine}, - {"angular", kmcudaDistanceMetricCosine} -}; - static void set_cuda_malloc_error() { PyErr_SetString(PyExc_MemoryError, "Failed to allocate memory on GPU"); } @@ -106,8 +90,8 @@ static bool get_metric(PyObject *metric_obj, KMCUDADistanceMetric *metric) { return false; } else { pyobj bytes(PyUnicode_AsASCIIString(metric_obj)); - auto immetric = metrics.find(PyBytes_AsString(bytes.get())); - if (immetric == metrics.end()) { + auto immetric = kmcuda::metrics.find(PyBytes_AsString(bytes.get())); + if (immetric == kmcuda::metrics.end()) { PyErr_SetString( PyExc_ValueError, "Unknown metric. Supported values are \"L2\" and \"cos\"."); @@ -193,8 +177,8 @@ static PyObject *py_kmeans_cuda(PyObject *self, PyObject *args, PyObject *kwargs KMCUDAInitMethod init; auto set_init = [&init](PyObject *obj) { pyobj bytes(PyUnicode_AsASCIIString(obj)); - auto iminit = init_methods.find(PyBytes_AsString(bytes.get())); - if (iminit == init_methods.end()) { + auto iminit = kmcuda::init_methods.find(PyBytes_AsString(bytes.get())); + if (iminit == kmcuda::init_methods.end()) { PyErr_SetString( PyExc_ValueError, "Unknown centroids initialization method. Supported values are " diff --git a/src/r.cc b/src/r.cc index 21936bb..f9fa5d7 100644 --- a/src/r.cc +++ b/src/r.cc @@ -1,14 +1,57 @@ +#include +#include +#include + #include #include #include #include "kmcuda.h" +namespace { + std::unordered_map parse_args( + std::initializer_list required, SEXP args) { + std::unordered_map result; + args = CDR(args); + bool pure = true; + for (unsigned i = 0; args != R_NilValue; i++, args = CDR(args)) { + SEXP arg = CAR(args); + if (isNull(TAG(args))) { + if (pure && i == result.size()) { + result.emplace(required.begin()[i], arg); + } else { + error("positional argument follows keyword argument"); + } + } else { + pure = false; + result.emplace(CHAR(PRINTNAME(TAG(args))), arg); + } + } + return result; + } + + template + R parse_dict(const std::unordered_map& dict, const char *arg, + SEXP name) { + if (!isString(name)) { + error("\"%s\" name must be a string", arg); + } + const char *init_name = CHAR(asChar(name)); + auto init_iter = dict.find(init_name); + if (init_iter == dict.end()) { + error("\"%s\" = \"%s\" is not supported", arg, init_name); + } + return init_iter->second; + } +} + extern "C" { static SEXP r_kmeans_cuda(SEXP args); +static SEXP r_knn_cuda(SEXP args); static R_ExternalMethodDef externalMethods[] = { {"kmeans_cuda", (DL_FUNC) &r_kmeans_cuda, -1}, + {"knn_cuda", (DL_FUNC) &r_knn_cuda, -1}, {NULL, NULL, 0} }; @@ -17,6 +60,187 @@ void R_init_libKMCUDA(DllInfo *info) { } static SEXP r_kmeans_cuda(SEXP args) { + auto kwargs = parse_args({"samples", "clusters"}, args); + std::unique_ptr samples_chunks; + int chunks_size = 0; + { + SEXP samples_obj = kwargs["samples"]; + if (isVector(samples_obj)) { + chunks_size = length(samples_obj); + samples_chunks.reset(new SEXP[chunks_size]); + for (unsigned i = 0; samples_obj != R_NilValue; + i++, samples_obj = CDR(samples_obj)) { + samples_chunks[i] = CAR(samples_obj); + } + } else { + chunks_size = 1; + samples_chunks.reset(new SEXP[1]); + samples_chunks[0] = samples_obj; + } + } + int samples_size = 0, features_size = 0; + for (int i = 0; i < chunks_size; i++) { + if (!isReal(samples_chunks[i])) { + error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices"); + } + SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol); + if (length(dims) != 2) { + error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices"); + } + int samples_size_i = INTEGER(dims)[0]; + if (static_cast(samples_size) + samples_size_i > INT32_MAX) { + error("too many samples (>INT32_MAX)"); + } + samples_size += samples_size_i; + int features_size_i = INTEGER(dims)[1]; + if (i == 0) { + features_size = features_size_i; + } else if (features_size != features_size_i) { + error("\"samples\" vector contains matrices with different number of columns"); + } + } + auto samples = std::unique_ptr(new float[ + static_cast(samples_size) * features_size]); + float *samples_float = samples.get(); + { + int offset = 0; + for (int i = 0; i < chunks_size; i++) { + double *samples_double = REAL(samples_chunks[i]); + SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol); + int samples_size_i = INTEGER(dims)[0] * features_size; + #pragma omp parallel for simd + for (int i = 0; i < samples_size_i; i++) { + samples_float[offset + i] = samples_double[i]; + } + offset += samples_size_i; + } + } + SEXP clusters_obj = kwargs["clusters"]; + if (!isInteger(clusters_obj)) { + error("\"clusters\" must be a positive integer"); + } + int clusters_size = INTEGER(clusters_obj)[0]; + if (clusters_size <= 0) { + error("\"clusters\" must be a positive integer"); + } + if (static_cast(clusters_size) * features_size > INT32_MAX + || clusters_size >= samples_size) { + error("\"clusters\" is too big"); + } + KMCUDAInitMethod init = kmcudaInitMethodPlusPlus; + int afkmc2_m = 0; + auto init_iter = kwargs.find("init"); + if (init_iter != kwargs.end()) { + if (isString(init_iter->second)) { + init = parse_dict(kmcuda::init_methods, "init", init_iter->second); + } else if (isList(init_iter->second)) { + if (length(init_iter->second) == 0) { + error("\"init\" may not be an empty list"); + } + init = parse_dict(kmcuda::init_methods, "init", CAR(init_iter->second)); + if (init == kmcudaInitMethodAFKMC2 && length(init_iter->second) > 1) { + SEXP afkmc2_m_obj = CAAR(init_iter->second); + if (!isInteger(afkmc2_m_obj)) { + error("\"init\" = %s: parameter must be a positive integer", + CHAR(asChar(CAR(init_iter->second)))); + } + afkmc2_m = INTEGER(afkmc2_m_obj)[0]; + if (afkmc2_m <= 0) { + error("\"init\" = %s: parameter must be a positive integer", + CHAR(asChar(CAR(init_iter->second)))); + } + } else if (length(init_iter->second) != 1) { + error("\"init\" has wrong number of parameters"); + } + } else { + error("\"init\" must be either a string or a list"); + } + } + float tolerance = 0.01; + auto tolerance_iter = kwargs.find("tolerance"); + if (tolerance_iter != kwargs.end()) { + if (!isReal(tolerance_iter->second)) { + error("\"tolerance\" must be a real value"); + } + tolerance = REAL(tolerance_iter->second)[0]; + if (tolerance < 0 || tolerance > 1) { + error("\"tolerance\" must be in [0, 1]"); + } + } + float yinyang_t = 0.1; + auto yinyang_t_iter = kwargs.find("yinyang_t"); + if (yinyang_t_iter != kwargs.end()) { + if (!isReal(yinyang_t_iter->second)) { + error("\"yinyang_t\" must be a real value"); + } + yinyang_t = REAL(yinyang_t_iter->second)[0]; + if (yinyang_t < 0 || yinyang_t > 0.5) { + error("\"tolerance\" must be in [0, 0.5]"); + } + } + KMCUDADistanceMetric metric = kmcudaDistanceMetricL2; + auto metric_iter = kwargs.find("metric"); + if (metric_iter != kwargs.end()) { + metric = parse_dict(kmcuda::metrics, "metric", metric_iter->second); + } + uint32_t seed = static_cast(time(NULL)); + auto seed_iter = kwargs.find("seed"); + if (seed_iter != kwargs.end()) { + seed = static_cast(INTEGER(seed_iter->second)[0]); + } + int device = 0; + auto device_iter = kwargs.find("device"); + if (device_iter != kwargs.end()) { + device = INTEGER(device_iter->second)[0]; + if (device < 0) { + error("\"device\" may not be negative"); + } + } + int verbosity = 0; + auto verbosity_iter = kwargs.find("verbosity"); + if (verbosity_iter != kwargs.end()) { + verbosity = INTEGER(verbosity_iter->second)[0]; + } + float average_distance, *average_distance_ptr = nullptr; + auto average_distance_iter = kwargs.find("average_distance"); + if (average_distance_iter != kwargs.end()) { + if (LOGICAL(average_distance_iter->second)[0]) { + average_distance_ptr = &average_distance; + } + } + auto centroids = std::unique_ptr( + new float[clusters_size * features_size]); + auto assignments = std::unique_ptr(new uint32_t[samples_size]); + auto result = kmeans_cuda( + init, &afkmc2_m, tolerance, yinyang_t, metric, samples_size, features_size, + clusters_size, seed, device, -1, 0, verbosity, samples_float, + centroids.get(), assignments.get(), average_distance_ptr); + if (result != kmcudaSuccess) { + error("kmeans_cuda error %d %s%s", result, + kmcuda::statuses.find(result)->second, (verbosity > 0)? "" : "; " + "\"verbosity\" = 2 would reveal the details"); + } + SEXP centroids2 = PROTECT(allocMatrix(REALSXP, clusters_size, features_size)); + double *centroids_double = REAL(centroids2); + float *centroids_float = centroids.get(); + #pragma omp parallel for simd + for (int i = 0; i < clusters_size * features_size; i++) { + centroids_float[i] = centroids_double[i]; + } + SEXP assignments2 = PROTECT(allocVector(INTSXP, samples_size)); + memcpy(INTEGER(assignments2), assignments.get(), samples_size * sizeof(int)); + SEXP tuple = PROTECT(allocVector(VECSXP, 2 + (average_distance_ptr != nullptr))); + SET_VECTOR_ELT(tuple, 0, centroids2); + SET_VECTOR_ELT(tuple, 1, assignments2); + if (average_distance_ptr != nullptr) { + SEXP average_distance2 = PROTECT(allocVector(REALSXP, 1)); + REAL(average_distance2)[0] = average_distance; + } + UNPROTECT(3 + (average_distance_ptr != nullptr)); + return tuple; +} + +static SEXP r_knn_cuda(SEXP args) { Rprintf("%d arguments\n", length(args)); return R_NilValue; } From d45e726f3ebbc305f46f0782a54e453a785683b5 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Fri, 17 Feb 2017 23:07:38 +0100 Subject: [PATCH 05/14] Fix the glue so that the test launches --- src/r.cc | 59 +++++++++++++++++++++++++++++++++--------------------- src/test.R | 27 +++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 23 deletions(-) create mode 100644 src/test.R diff --git a/src/r.cc b/src/r.cc index f9fa5d7..cd98223 100644 --- a/src/r.cc +++ b/src/r.cc @@ -42,6 +42,25 @@ namespace { } return init_iter->second; } + + int parse_int(SEXP value) { + if (isInteger(value)) { + return INTEGER(value)[0]; + } + return REAL(value)[0]; + } + + int parse_int(const std::unordered_map &kwargs, + const std::string &name, int def) { + auto iter = kwargs.find(name); + if (iter == kwargs.end()) { + return def; + } + if (!isNumeric(iter->second)) { + error("\"%s\" must be an integer", name.c_str()); + } + return parse_int(iter->second); + } } extern "C" { @@ -65,7 +84,7 @@ static SEXP r_kmeans_cuda(SEXP args) { int chunks_size = 0; { SEXP samples_obj = kwargs["samples"]; - if (isVector(samples_obj)) { + if (TYPEOF(samples_obj) == VECSXP) { chunks_size = length(samples_obj); samples_chunks.reset(new SEXP[chunks_size]); for (unsigned i = 0; samples_obj != R_NilValue; @@ -116,10 +135,10 @@ static SEXP r_kmeans_cuda(SEXP args) { } } SEXP clusters_obj = kwargs["clusters"]; - if (!isInteger(clusters_obj)) { + if (!isNumeric(clusters_obj)) { error("\"clusters\" must be a positive integer"); } - int clusters_size = INTEGER(clusters_obj)[0]; + int clusters_size = parse_int(clusters_obj); if (clusters_size <= 0) { error("\"clusters\" must be a positive integer"); } @@ -140,11 +159,11 @@ static SEXP r_kmeans_cuda(SEXP args) { init = parse_dict(kmcuda::init_methods, "init", CAR(init_iter->second)); if (init == kmcudaInitMethodAFKMC2 && length(init_iter->second) > 1) { SEXP afkmc2_m_obj = CAAR(init_iter->second); - if (!isInteger(afkmc2_m_obj)) { + if (!isNumeric(afkmc2_m_obj)) { error("\"init\" = %s: parameter must be a positive integer", CHAR(asChar(CAR(init_iter->second)))); } - afkmc2_m = INTEGER(afkmc2_m_obj)[0]; + afkmc2_m = parse_int(afkmc2_m_obj); if (afkmc2_m <= 0) { error("\"init\" = %s: parameter must be a positive integer", CHAR(asChar(CAR(init_iter->second)))); @@ -183,24 +202,12 @@ static SEXP r_kmeans_cuda(SEXP args) { if (metric_iter != kwargs.end()) { metric = parse_dict(kmcuda::metrics, "metric", metric_iter->second); } - uint32_t seed = static_cast(time(NULL)); - auto seed_iter = kwargs.find("seed"); - if (seed_iter != kwargs.end()) { - seed = static_cast(INTEGER(seed_iter->second)[0]); - } - int device = 0; - auto device_iter = kwargs.find("device"); - if (device_iter != kwargs.end()) { - device = INTEGER(device_iter->second)[0]; - if (device < 0) { - error("\"device\" may not be negative"); - } - } - int verbosity = 0; - auto verbosity_iter = kwargs.find("verbosity"); - if (verbosity_iter != kwargs.end()) { - verbosity = INTEGER(verbosity_iter->second)[0]; + uint32_t seed = parse_int(kwargs, "seed", time(NULL)); + int device = parse_int(kwargs, "device", 0); + if (device < 0) { + error("\"device\" may not be negative"); } + int verbosity = parse_int(kwargs, "verbosity", 0); float average_distance, *average_distance_ptr = nullptr; auto average_distance_iter = kwargs.find("average_distance"); if (average_distance_iter != kwargs.end()) { @@ -232,11 +239,17 @@ static SEXP r_kmeans_cuda(SEXP args) { SEXP tuple = PROTECT(allocVector(VECSXP, 2 + (average_distance_ptr != nullptr))); SET_VECTOR_ELT(tuple, 0, centroids2); SET_VECTOR_ELT(tuple, 1, assignments2); + SEXP names = PROTECT(allocVector( + STRSXP, 2 + (average_distance_ptr != nullptr))); + SET_STRING_ELT(names, 0, mkChar("centroids")); + SET_STRING_ELT(names, 1, mkChar("assignments")); if (average_distance_ptr != nullptr) { SEXP average_distance2 = PROTECT(allocVector(REALSXP, 1)); REAL(average_distance2)[0] = average_distance; + SET_STRING_ELT(names, 2, mkChar("average_distance")); } - UNPROTECT(3 + (average_distance_ptr != nullptr)); + setAttrib(tuple, R_NamesSymbol, names); + UNPROTECT(4 + (average_distance_ptr != nullptr)); return tuple; } diff --git a/src/test.R b/src/test.R new file mode 100644 index 0000000..862c41c --- /dev/null +++ b/src/test.R @@ -0,0 +1,27 @@ +library(testthat) + +if (exists("testing")) { + setwd(cwd) + dyn.load("libKMCUDA.so") + + context("K-means") + test_that("Random Lloyd",{ + samples <- replicate(4, rnorm(16000)) + ret <- .External("kmeans_cuda", samples, 50, init="random", seed=777, verbosity=2) + print(ret) + }) +} else { + testing <- TRUE + cwd <- getwd() + thisFile <- function() { + cmdArgs <- commandArgs(trailingOnly=FALSE) + needle <- "--file=" + match <- grep(needle, cmdArgs) + if (length(match) > 0) { + return(normalizePath(sub(needle, "", cmdArgs[match]))) + } else { + return(normalizePath(sys.frames()[[1]]$ofile)) + } + } + test_results <- test_dir(dirname(thisFile()), reporter="summary") +} From 2868527379859f32a5a882f7649d3d8db508a2e9 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Sat, 18 Feb 2017 11:09:57 +0100 Subject: [PATCH 06/14] Pass two basic tests --- src/r.cc | 4 ++-- src/test.R | 25 +++++++++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/r.cc b/src/r.cc index cd98223..9fc3186 100644 --- a/src/r.cc +++ b/src/r.cc @@ -152,7 +152,7 @@ static SEXP r_kmeans_cuda(SEXP args) { if (init_iter != kwargs.end()) { if (isString(init_iter->second)) { init = parse_dict(kmcuda::init_methods, "init", init_iter->second); - } else if (isList(init_iter->second)) { + } else if (TYPEOF(init_iter->second) == VECSXP) { if (length(init_iter->second) == 0) { error("\"init\" may not be an empty list"); } @@ -232,7 +232,7 @@ static SEXP r_kmeans_cuda(SEXP args) { float *centroids_float = centroids.get(); #pragma omp parallel for simd for (int i = 0; i < clusters_size * features_size; i++) { - centroids_float[i] = centroids_double[i]; + centroids_double[i] = centroids_float[i]; } SEXP assignments2 = PROTECT(allocVector(INTSXP, samples_size)); memcpy(INTEGER(assignments2), assignments.get(), samples_size * sizeof(int)); diff --git a/src/test.R b/src/test.R index 862c41c..4c03eaf 100644 --- a/src/test.R +++ b/src/test.R @@ -5,10 +5,27 @@ if (exists("testing")) { dyn.load("libKMCUDA.so") context("K-means") - test_that("Random Lloyd",{ - samples <- replicate(4, rnorm(16000)) - ret <- .External("kmeans_cuda", samples, 50, init="random", seed=777, verbosity=2) - print(ret) + test_that("Random",{ + set.seed(42) + samples <- replicate(4, runif(16000)) + result = .External("kmeans_cuda", samples, 50, tolerance=0.01, + init="random", seed=777, verbosity=2) + kmcuda_asses = replicate(1, result$assignments) + attach(kmeans(samples, result$centroids, iter.max=1)) + reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster) + print(sprintf("Reassignments: %f", reasses)) + expect_lt(reasses, 0.01) + }) + test_that("SingleDeviceKmeans++Lloyd",{ + set.seed(42) + samples <- replicate(4, runif(16000)) + result = .External("kmeans_cuda", samples, 50, yinyang_t=0, device=1, + init="kmeans++", seed=777, verbosity=2) + kmcuda_asses = replicate(1, result$assignments) + attach(kmeans(samples, result$centroids, iter.max=1)) + reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster) + print(sprintf("Reassignments: %f", reasses)) + expect_lt(reasses, 0.01) }) } else { testing <- TRUE From 4997bc8672e438870a30d7ed38bf626c5b87b5a6 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Sat, 18 Feb 2017 21:25:27 +0100 Subject: [PATCH 07/14] Pass all the tests for R kmeans_cuda --- .gitignore | 2 ++ src/r.cc | 34 ++++++++++++++++++++++++---------- src/test.R | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 1303fee..5323a25 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ .idea +cmake-build-* +**/*.cbp **/CMakeCache.txt **/CMakeFiles **/.DS_Store diff --git a/src/r.cc b/src/r.cc index 9fc3186..016103d 100644 --- a/src/r.cc +++ b/src/r.cc @@ -84,17 +84,18 @@ static SEXP r_kmeans_cuda(SEXP args) { int chunks_size = 0; { SEXP samples_obj = kwargs["samples"]; - if (TYPEOF(samples_obj) == VECSXP) { + if (isReal(samples_obj)) { + chunks_size = 1; + samples_chunks.reset(new SEXP[1]); + samples_chunks[0] = samples_obj; + } else if (TYPEOF(samples_obj) == VECSXP) { chunks_size = length(samples_obj); samples_chunks.reset(new SEXP[chunks_size]); - for (unsigned i = 0; samples_obj != R_NilValue; - i++, samples_obj = CDR(samples_obj)) { - samples_chunks[i] = CAR(samples_obj); + for (int i = 0; i < chunks_size; i++) { + samples_chunks[i] = VECTOR_ELT(samples_obj, i); } } else { - chunks_size = 1; - samples_chunks.reset(new SEXP[1]); - samples_chunks[0] = samples_obj; + error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices"); } } int samples_size = 0, features_size = 0; @@ -146,6 +147,8 @@ static SEXP r_kmeans_cuda(SEXP args) { || clusters_size >= samples_size) { error("\"clusters\" is too big"); } + auto centroids = std::unique_ptr( + new float[clusters_size * features_size]); KMCUDAInitMethod init = kmcudaInitMethodPlusPlus; int afkmc2_m = 0; auto init_iter = kwargs.find("init"); @@ -171,8 +174,21 @@ static SEXP r_kmeans_cuda(SEXP args) { } else if (length(init_iter->second) != 1) { error("\"init\" has wrong number of parameters"); } + } else if (isReal(init_iter->second)) { + init = kmcudaInitMethodImport; + double *centroids_double = REAL(init_iter->second); + SEXP dims = getAttrib(init_iter->second, R_DimSymbol); + if (length(dims) != 2 + || INTEGER(dims)[0] != clusters_size + || INTEGER(dims)[1] != features_size) { + error("invalid centroids dimensions in \"init\""); + } + #pragma omp parallel for simd + for (int i = 0; i < clusters_size * features_size; i++) { + centroids[i] = centroids_double[i]; + } } else { - error("\"init\" must be either a string or a list"); + error("\"init\" must be either a string or a list or a 2D matrix"); } } float tolerance = 0.01; @@ -215,8 +231,6 @@ static SEXP r_kmeans_cuda(SEXP args) { average_distance_ptr = &average_distance; } } - auto centroids = std::unique_ptr( - new float[clusters_size * features_size]); auto assignments = std::unique_ptr(new uint32_t[samples_size]); auto result = kmeans_cuda( init, &afkmc2_m, tolerance, yinyang_t, metric, samples_size, features_size, diff --git a/src/test.R b/src/test.R index 4c03eaf..1650c6e 100644 --- a/src/test.R +++ b/src/test.R @@ -27,6 +27,42 @@ if (exists("testing")) { print(sprintf("Reassignments: %f", reasses)) expect_lt(reasses, 0.01) }) + test_that("MultiSamples",{ + set.seed(42) + samples1 <- replicate(4, runif(16000)) + samples2 <- replicate(4, runif(16000)) + result = .External("kmeans_cuda", list(samples1, samples2), 50, + init="kmeans++", seed=777, verbosity=2) + kmcuda_asses = replicate(1, result$assignments) + expect_equal(length(kmcuda_asses), 32000) + attach(kmeans(rbind(samples1, samples2), result$centroids, iter.max=1)) + reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster) + print(sprintf("Reassignments: %f", reasses)) + expect_lt(reasses, 0.01) + }) + test_that("AFK-MC2",{ + set.seed(42) + samples <- replicate(4, runif(16000)) + result = .External("kmeans_cuda", samples, 50, tolerance=0.01, + init=c("afkmc2", 100), seed=777, verbosity=2) + kmcuda_asses = replicate(1, result$assignments) + attach(kmeans(samples, result$centroids, iter.max=1)) + reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster) + print(sprintf("Reassignments: %f", reasses)) + expect_lt(reasses, 0.01) + }) + test_that("ImportCentroids",{ + set.seed(42) + samples <- replicate(4, runif(16000)) + centroids <- replicate(4, runif(50)) + result = .External("kmeans_cuda", samples, 50, tolerance=0.01, + init=centroids, seed=777, verbosity=2) + kmcuda_asses = replicate(1, result$assignments) + attach(kmeans(samples, result$centroids, iter.max=1)) + reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster) + print(sprintf("Reassignments: %f", reasses)) + expect_lt(reasses, 0.01) + }) } else { testing <- TRUE cwd <- getwd() From 21bc6c46284eb62205d341d8ab8b223ee942180b Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Sun, 19 Feb 2017 00:04:53 +0100 Subject: [PATCH 08/14] Add knn code and refactor r_kmeans_cuda --- src/r.cc | 245 +++++++++++++++++++++++++++++++++++++---------------- src/test.R | 9 ++ 2 files changed, 183 insertions(+), 71 deletions(-) diff --git a/src/r.cc b/src/r.cc index 016103d..c8f66d3 100644 --- a/src/r.cc +++ b/src/r.cc @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -9,6 +10,7 @@ namespace { std::unordered_map parse_args( + const std::unordered_set &allowed, std::initializer_list required, SEXP args) { std::unordered_map result; args = CDR(args); @@ -23,7 +25,11 @@ namespace { } } else { pure = false; - result.emplace(CHAR(PRINTNAME(TAG(args))), arg); + const char *name = CHAR(PRINTNAME(TAG(args))); + if (allowed.find(name) == allowed.end()) { + error("got an unexpected keyword argument \"%s\"", name); + } + result.emplace(name, arg); } } return result; @@ -61,6 +67,103 @@ namespace { } return parse_int(iter->second); } + + void parse_samples( + const std::unordered_map &kwargs, + std::unique_ptr *samples, uint32_t *samples_size, + uint16_t *features_size) { + std::unique_ptr samples_chunks; + int chunks_size = 0; + { + auto samples_iter = kwargs.find("samples"); + if (samples_iter == kwargs.end()) { + error("\"samples\" must be defined"); + } + SEXP samples_obj = samples_iter->second; + if (isReal(samples_obj)) { + chunks_size = 1; + samples_chunks.reset(new SEXP[1]); + samples_chunks[0] = samples_obj; + } else if (TYPEOF(samples_obj) == VECSXP) { + chunks_size = length(samples_obj); + samples_chunks.reset(new SEXP[chunks_size]); + for (int i = 0; i < chunks_size; i++) { + samples_chunks[i] = VECTOR_ELT(samples_obj, i); + } + } else { + error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices"); + } + } + *samples_size = 0; + for (int i = 0; i < chunks_size; i++) { + if (!isReal(samples_chunks[i])) { + error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices"); + } + SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol); + if (length(dims) != 2) { + error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices"); + } + int samples_size_i = INTEGER(dims)[0]; + if (static_cast(*samples_size) + samples_size_i > INT32_MAX) { + error("too many samples (>INT32_MAX)"); + } + *samples_size += samples_size_i; + int features_size_i = INTEGER(dims)[1]; + if (features_size_i > UINT16_MAX) { + error("too many features (>UINT16_MAX)"); + } + if (i == 0) { + *features_size = features_size_i; + } else if (*features_size != features_size_i) { + error("\"samples\" vector contains matrices with different number of columns"); + } + } + samples->reset(new float[ + static_cast(*samples_size) * *features_size]); + float *samples_float = samples->get(); + { + int offset = 0; + for (int i = 0; i < chunks_size; i++) { + double *samples_double = REAL(samples_chunks[i]); + SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol); + int samples_size_i = INTEGER(dims)[0] * *features_size; + #pragma omp parallel for simd + for (int i = 0; i < samples_size_i; i++) { + samples_float[offset + i] = samples_double[i]; + } + offset += samples_size_i; + } + } + } + + KMCUDADistanceMetric parse_metric( + const std::unordered_map &kwargs) { + KMCUDADistanceMetric metric = kmcudaDistanceMetricL2; + auto metric_iter = kwargs.find("metric"); + if (metric_iter != kwargs.end()) { + metric = parse_dict(kmcuda::metrics, "metric", metric_iter->second); + } + return metric; + } + + int parse_device( + const std::unordered_map &kwargs) { + int device = parse_int(kwargs, "device", 0); + if (device < 0) { + error("\"device\" may not be negative"); + } + return device; + } + + static const std::unordered_set kmeans_kwargs { + "samples", "clusters", "tolerance", "init", "yinyang_t", "metric", + "average_distance", "seed", "device", "verbosity" + }; + + static const std::unordered_set knn_kwargs { + "k", "samples", "centroids", "assignments", "metric", "device", + "verbosity" + }; } extern "C" { @@ -79,62 +182,11 @@ void R_init_libKMCUDA(DllInfo *info) { } static SEXP r_kmeans_cuda(SEXP args) { - auto kwargs = parse_args({"samples", "clusters"}, args); - std::unique_ptr samples_chunks; - int chunks_size = 0; - { - SEXP samples_obj = kwargs["samples"]; - if (isReal(samples_obj)) { - chunks_size = 1; - samples_chunks.reset(new SEXP[1]); - samples_chunks[0] = samples_obj; - } else if (TYPEOF(samples_obj) == VECSXP) { - chunks_size = length(samples_obj); - samples_chunks.reset(new SEXP[chunks_size]); - for (int i = 0; i < chunks_size; i++) { - samples_chunks[i] = VECTOR_ELT(samples_obj, i); - } - } else { - error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices"); - } - } - int samples_size = 0, features_size = 0; - for (int i = 0; i < chunks_size; i++) { - if (!isReal(samples_chunks[i])) { - error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices"); - } - SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol); - if (length(dims) != 2) { - error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices"); - } - int samples_size_i = INTEGER(dims)[0]; - if (static_cast(samples_size) + samples_size_i > INT32_MAX) { - error("too many samples (>INT32_MAX)"); - } - samples_size += samples_size_i; - int features_size_i = INTEGER(dims)[1]; - if (i == 0) { - features_size = features_size_i; - } else if (features_size != features_size_i) { - error("\"samples\" vector contains matrices with different number of columns"); - } - } - auto samples = std::unique_ptr(new float[ - static_cast(samples_size) * features_size]); - float *samples_float = samples.get(); - { - int offset = 0; - for (int i = 0; i < chunks_size; i++) { - double *samples_double = REAL(samples_chunks[i]); - SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol); - int samples_size_i = INTEGER(dims)[0] * features_size; - #pragma omp parallel for simd - for (int i = 0; i < samples_size_i; i++) { - samples_float[offset + i] = samples_double[i]; - } - offset += samples_size_i; - } - } + auto kwargs = parse_args(kmeans_kwargs, {"samples", "clusters"}, args); + std::unique_ptr samples; + uint32_t samples_size; + uint16_t features_size; + parse_samples(kwargs, &samples, &samples_size, &features_size); SEXP clusters_obj = kwargs["clusters"]; if (!isNumeric(clusters_obj)) { error("\"clusters\" must be a positive integer"); @@ -144,7 +196,7 @@ static SEXP r_kmeans_cuda(SEXP args) { error("\"clusters\" must be a positive integer"); } if (static_cast(clusters_size) * features_size > INT32_MAX - || clusters_size >= samples_size) { + || static_cast(clusters_size) >= samples_size) { error("\"clusters\" is too big"); } auto centroids = std::unique_ptr( @@ -176,13 +228,13 @@ static SEXP r_kmeans_cuda(SEXP args) { } } else if (isReal(init_iter->second)) { init = kmcudaInitMethodImport; - double *centroids_double = REAL(init_iter->second); SEXP dims = getAttrib(init_iter->second, R_DimSymbol); if (length(dims) != 2 || INTEGER(dims)[0] != clusters_size || INTEGER(dims)[1] != features_size) { error("invalid centroids dimensions in \"init\""); } + double *centroids_double = REAL(init_iter->second); #pragma omp parallel for simd for (int i = 0; i < clusters_size * features_size; i++) { centroids[i] = centroids_double[i]; @@ -213,16 +265,9 @@ static SEXP r_kmeans_cuda(SEXP args) { error("\"tolerance\" must be in [0, 0.5]"); } } - KMCUDADistanceMetric metric = kmcudaDistanceMetricL2; - auto metric_iter = kwargs.find("metric"); - if (metric_iter != kwargs.end()) { - metric = parse_dict(kmcuda::metrics, "metric", metric_iter->second); - } + KMCUDADistanceMetric metric = parse_metric(kwargs); uint32_t seed = parse_int(kwargs, "seed", time(NULL)); - int device = parse_int(kwargs, "device", 0); - if (device < 0) { - error("\"device\" may not be negative"); - } + int device = parse_device(kwargs); int verbosity = parse_int(kwargs, "verbosity", 0); float average_distance, *average_distance_ptr = nullptr; auto average_distance_iter = kwargs.find("average_distance"); @@ -234,7 +279,7 @@ static SEXP r_kmeans_cuda(SEXP args) { auto assignments = std::unique_ptr(new uint32_t[samples_size]); auto result = kmeans_cuda( init, &afkmc2_m, tolerance, yinyang_t, metric, samples_size, features_size, - clusters_size, seed, device, -1, 0, verbosity, samples_float, + clusters_size, seed, device, -1, 0, verbosity, samples.get(), centroids.get(), assignments.get(), average_distance_ptr); if (result != kmcudaSuccess) { error("kmeans_cuda error %d %s%s", result, @@ -260,6 +305,7 @@ static SEXP r_kmeans_cuda(SEXP args) { if (average_distance_ptr != nullptr) { SEXP average_distance2 = PROTECT(allocVector(REALSXP, 1)); REAL(average_distance2)[0] = average_distance; + SET_VECTOR_ELT(tuple, 2, average_distance2); SET_STRING_ELT(names, 2, mkChar("average_distance")); } setAttrib(tuple, R_NamesSymbol, names); @@ -268,8 +314,65 @@ static SEXP r_kmeans_cuda(SEXP args) { } static SEXP r_knn_cuda(SEXP args) { - Rprintf("%d arguments\n", length(args)); - return R_NilValue; + auto kwargs = parse_args( + knn_kwargs, {"k", "samples", "centroids", "assignments"}, args); + int k = parse_int(kwargs, "k", 0); + if (k <= 0) { + error("\"k\" must be positive"); + } + std::unique_ptr samples; + uint32_t samples_size; + uint16_t features_size; + parse_samples(kwargs, &samples, &samples_size, &features_size); + auto centroids_iter = kwargs.find("centroids"); + if (centroids_iter == kwargs.end()) { + error("\"centroids\" must be specified"); + } + if (!isReal(centroids_iter->second)) { + error("\"centroids\" must be a 2D real matrix"); + } + SEXP dims = getAttrib(centroids_iter->second, R_DimSymbol); + if (length(dims) != 2 || INTEGER(dims)[1] != features_size) { + error("invalid \"centroids\"'s dimensions"); + } + int clusters_size = INTEGER(dims)[0]; + std::unique_ptr centroids(new float[clusters_size * features_size]); + double *centroids_double = REAL(centroids_iter->second); + float *centroids_float = centroids.get(); + #pragma omp parallel for simd + for (int i = 0; i < clusters_size * features_size; i++) { + centroids_float[i] = centroids_double[i]; + } + auto assignments_iter = kwargs.find("assignments"); + if (assignments_iter == kwargs.end()) { + error("\"assignments\" must be specified"); + } + if (!isInteger(assignments_iter->second)) { + error("\"assignments\" must be an integer vector"); + } + dims = getAttrib(assignments_iter->second, R_DimSymbol); + if (length(dims) != 1 + || static_cast(INTEGER(dims)[0]) != samples_size) { + error("invalid \"assignments\"'s dimensions"); + } + std::unique_ptr assignments(new uint32_t[samples_size]); + memcpy(assignments.get(), INTEGER(assignments_iter->second), + static_cast(samples_size) * sizeof(uint32_t)); + KMCUDADistanceMetric metric = parse_metric(kwargs); + int device = parse_device(kwargs); + int verbosity = parse_int(kwargs, "verbosity", 0); + SEXP neighbors = PROTECT(allocMatrix(INTSXP, samples_size, k)); + auto result = knn_cuda( + k, metric, samples_size, features_size, clusters_size, device, -1, 0, + verbosity, samples.get(), centroids.get(), assignments.get(), + reinterpret_cast(INTEGER(neighbors))); + if (result != kmcudaSuccess) { + error("knn_cuda error %d %s%s", result, + kmcuda::statuses.find(result)->second, (verbosity > 0)? "" : "; " + "\"verbosity\" = 2 would reveal the details"); + } + UNPROTECT(1); + return neighbors; } } // extern "C" diff --git a/src/test.R b/src/test.R index 1650c6e..dff5d83 100644 --- a/src/test.R +++ b/src/test.R @@ -63,6 +63,15 @@ if (exists("testing")) { print(sprintf("Reassignments: %f", reasses)) expect_lt(reasses, 0.01) }) + test_that("RandomPlusAverageDistance",{ + set.seed(42) + samples <- replicate(4, runif(16000)) + result = .External("kmeans_cuda", samples, 50, tolerance=0.01, + init="random", seed=777, verbosity=2, + average_distance=TRUE) + print(result$average_distance) + expect_equal(result$average_distance, 0.2127081, tolerance=0.0000001); + }) } else { testing <- TRUE cwd <- getwd() From b14efb945f3070042f5afb8280716a7dcb6c2065 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Mon, 20 Feb 2017 23:01:14 +0100 Subject: [PATCH 09/14] Pass K-nn R test K-means tests are still partially broken --- src/kmcuda.cc | 25 ++++++++++++++++++ src/private.h | 4 +++ src/r.cc | 67 ++++++++++++++++++++++++++++++------------------ src/test.R | 16 ++++++++++++ src/test.py | 9 +++++++ src/transpose.cu | 11 ++++++++ 6 files changed, 107 insertions(+), 25 deletions(-) diff --git a/src/kmcuda.cc b/src/kmcuda.cc index 02079b9..92c7c2f 100644 --- a/src/kmcuda.cc +++ b/src/kmcuda.cc @@ -191,6 +191,31 @@ KMCUDAResult kmeans_init_centroids( uint32_t seed, const std::vector &devs, int device_ptrs, int fp16x2, int32_t verbosity, const float *host_centroids, const udevptrs &samples, udevptrs *dists, udevptrs *aux, udevptrs *centroids) { + if (metric == kmcudaDistanceMetricCosine) { + // 3 sanity checks + float *probe; + CUCH(cudaMallocManaged(reinterpret_cast(&probe), + static_cast(features_size) * sizeof(float)), + kmcudaMemoryAllocationFailure); + unique_devptr managed(probe); + cudaSetDevice(devs[0]); + for (uint32_t s : {0u, samples_size / 2, samples_size - 1}) { + RETERR(cuda_extract_sample_t( + s, samples_size, features_size, verbosity, samples[0].get(), probe)); + double norm = 0; + #pragma omp simd + for (uint16_t i = 0; i < features_size; i++) { + float v = probe[i]; + norm += v * v; + } + if (norm > 1.0001 || norm < 0.9999) { + INFO("error: angular distance: samples[%" PRIu32 "] has L2 norm = %f " + "which is outside [0.9999, 1.0001]\n", s, norm); + return kmcudaInvalidArguments; + } + } + } + srand(seed); switch (method) { case kmcudaInitMethodImport: diff --git a/src/private.h b/src/private.h index 1e93c89..0ea04e2 100644 --- a/src/private.h +++ b/src/private.h @@ -259,6 +259,10 @@ KMCUDAResult cuda_copy_sample_t( const std::vector &devs, int verbosity, const udevptrs &samples, udevptrs *dest); +KMCUDAResult cuda_extract_sample_t( + uint32_t index, uint32_t samples_size, uint16_t features_size, + int verbosity, const float *samples, float *dest); + KMCUDAResult cuda_transpose( uint32_t samples_size, uint16_t features_size, bool forward, const std::vector &devs, int verbosity, udevptrs *samples); diff --git a/src/r.cc b/src/r.cc index c8f66d3..02c45ca 100644 --- a/src/r.cc +++ b/src/r.cc @@ -126,12 +126,15 @@ namespace { for (int i = 0; i < chunks_size; i++) { double *samples_double = REAL(samples_chunks[i]); SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol); - int samples_size_i = INTEGER(dims)[0] * *features_size; - #pragma omp parallel for simd - for (int i = 0; i < samples_size_i; i++) { - samples_float[offset + i] = samples_double[i]; + uint32_t fsize = *features_size; + uint32_t ssize = *samples_size; + #pragma omp parallel for + for (uint64_t f = 0; f < fsize; f++) { + for (uint64_t s = 0; s < ssize; s++) { + samples_float[offset + s * fsize + f] = samples_double[f * ssize + s]; + } } - offset += samples_size_i; + offset += INTEGER(dims)[0] * fsize; } } } @@ -235,9 +238,11 @@ static SEXP r_kmeans_cuda(SEXP args) { error("invalid centroids dimensions in \"init\""); } double *centroids_double = REAL(init_iter->second); - #pragma omp parallel for simd - for (int i = 0; i < clusters_size * features_size; i++) { - centroids[i] = centroids_double[i]; + #pragma omp parallel for + for (uint64_t f = 0; f < features_size; f++) { + for (int64_t c = 0; c < clusters_size; c++) { + centroids[c * features_size + f] = centroids_double[f * clusters_size + c]; + } } } else { error("\"init\" must be either a string or a list or a 2D matrix"); @@ -289,9 +294,11 @@ static SEXP r_kmeans_cuda(SEXP args) { SEXP centroids2 = PROTECT(allocMatrix(REALSXP, clusters_size, features_size)); double *centroids_double = REAL(centroids2); float *centroids_float = centroids.get(); - #pragma omp parallel for simd - for (int i = 0; i < clusters_size * features_size; i++) { - centroids_double[i] = centroids_float[i]; + #pragma omp parallel for + for (uint64_t f = 0; f < features_size; f++) { + for (int64_t c = 0; c < clusters_size; c++) { + centroids_double[f * clusters_size + c] = centroids_float[c * features_size + f]; + } } SEXP assignments2 = PROTECT(allocVector(INTSXP, samples_size)); memcpy(INTEGER(assignments2), assignments.get(), samples_size * sizeof(int)); @@ -324,6 +331,9 @@ static SEXP r_knn_cuda(SEXP args) { uint32_t samples_size; uint16_t features_size; parse_samples(kwargs, &samples, &samples_size, &features_size); + if (static_cast(samples_size) * k > INT32_MAX) { + error("too big \"k\": dim(samples)[0] * k > INT32_MAX"); + } auto centroids_iter = kwargs.find("centroids"); if (centroids_iter == kwargs.end()) { error("\"centroids\" must be specified"); @@ -339,9 +349,11 @@ static SEXP r_knn_cuda(SEXP args) { std::unique_ptr centroids(new float[clusters_size * features_size]); double *centroids_double = REAL(centroids_iter->second); float *centroids_float = centroids.get(); - #pragma omp parallel for simd - for (int i = 0; i < clusters_size * features_size; i++) { - centroids_float[i] = centroids_double[i]; + #pragma omp parallel for + for (uint64_t f = 0; f < features_size; f++) { + for (int64_t c = 0; c < clusters_size; c++) { + centroids_float[c * features_size + f] = centroids_double[f * clusters_size + c]; + } } auto assignments_iter = kwargs.find("assignments"); if (assignments_iter == kwargs.end()) { @@ -350,29 +362,34 @@ static SEXP r_knn_cuda(SEXP args) { if (!isInteger(assignments_iter->second)) { error("\"assignments\" must be an integer vector"); } - dims = getAttrib(assignments_iter->second, R_DimSymbol); - if (length(dims) != 1 - || static_cast(INTEGER(dims)[0]) != samples_size) { - error("invalid \"assignments\"'s dimensions"); + if (static_cast(length(assignments_iter->second)) != samples_size) { + error("invalid \"assignments\"'s length"); } - std::unique_ptr assignments(new uint32_t[samples_size]); - memcpy(assignments.get(), INTEGER(assignments_iter->second), - static_cast(samples_size) * sizeof(uint32_t)); KMCUDADistanceMetric metric = parse_metric(kwargs); int device = parse_device(kwargs); int verbosity = parse_int(kwargs, "verbosity", 0); - SEXP neighbors = PROTECT(allocMatrix(INTSXP, samples_size, k)); + std::unique_ptr neighbors(new uint32_t[samples_size * k]); auto result = knn_cuda( k, metric, samples_size, features_size, clusters_size, device, -1, 0, - verbosity, samples.get(), centroids.get(), assignments.get(), - reinterpret_cast(INTEGER(neighbors))); + verbosity, samples.get(), centroids.get(), + reinterpret_cast(INTEGER(assignments_iter->second)), + neighbors.get()); if (result != kmcudaSuccess) { error("knn_cuda error %d %s%s", result, kmcuda::statuses.find(result)->second, (verbosity > 0)? "" : "; " "\"verbosity\" = 2 would reveal the details"); } + SEXP neighbors_obj = PROTECT(allocMatrix(INTSXP, samples_size, k)); + const uint32_t *neighbors_ptr = neighbors.get(); + int *neighbors_obj_ptr = INTEGER(neighbors_obj); + #pragma omp parallel for + for (int i = 0; i < k; i++) { + for (uint32_t s = 0; s < samples_size; s++) { + neighbors_obj_ptr[i * samples_size + s] = neighbors_ptr[s * k + i]; + } + } UNPROTECT(1); - return neighbors; + return neighbors_obj; } } // extern "C" diff --git a/src/test.R b/src/test.R index dff5d83..2f4fc77 100644 --- a/src/test.R +++ b/src/test.R @@ -72,6 +72,22 @@ if (exists("testing")) { print(result$average_distance) expect_equal(result$average_distance, 0.2127081, tolerance=0.0000001); }) + + context("K-nn") + test_that("Cosine",{ + set.seed(42) + samples <- replicate(4, runif(16000)) + samples <- samples / sqrt(rowSums(samples^2)) + cls = .External("kmeans_cuda", samples, 50, tolerance=0.01, metric="cos", + seed=777, verbosity=2, yinyang_t=0) + lapply(rowSums(cls$centroids^2), function(r) expect_equal(r, 1, 0.0001)) + result = .External("knn_cuda", 20, samples, cls$centroids, + cls$assignments, metric="cos", verbosity=2) + # the result is properly validated in test.py + expect_equal(dim(result), c(16000, 20)) + expect_equal(class(result), "matrix") + expect_equal(sum(apply(result, 1, function(r) length(unique(r)))), 16000 * 20) + }) } else { testing <- TRUE cwd <- getwd() diff --git a/src/test.py b/src/test.py index e9507ad..b05ff42 100644 --- a/src/test.py +++ b/src/test.py @@ -447,6 +447,15 @@ def test_cosine_metric(self): self.assertEqual(numpy.min(assignments), 0) self.assertEqual(numpy.max(assignments), 3) + def test_cosine_metric2(self): + samples = numpy.random.random((16000, 4)).astype(numpy.float32) + samples /= numpy.linalg.norm(samples, axis=1)[:, numpy.newaxis] + centroids, assignments = kmeans_cuda( + samples, 50, metric="cos", verbosity=2, seed=3) + for c in centroids: + norm = numpy.linalg.norm(c) + self.assertTrue(0.9999 < norm < 1.0001) + def test_256_features(self): arr = numpy.random.rand(1000, 256).astype(numpy.float32) arr /= numpy.linalg.norm(arr, axis=1)[:, None] diff --git a/src/transpose.cu b/src/transpose.cu index f74ea90..af1edab 100644 --- a/src/transpose.cu +++ b/src/transpose.cu @@ -69,6 +69,17 @@ KMCUDAResult cuda_copy_sample_t( return kmcudaSuccess; } +KMCUDAResult cuda_extract_sample_t( + uint32_t index, uint32_t samples_size, uint16_t features_size, + int verbosity, const float *samples, float *dest) { + dim3 block(min(1024, features_size), 1, 1); + dim3 grid(upper(static_cast(features_size), block.x), 1, 1); + copy_sample_t<<>>( + index, samples_size, features_size, samples, dest); + CUCH(cudaDeviceSynchronize(), kmcudaRuntimeError); + return kmcudaSuccess; +} + KMCUDAResult cuda_transpose( uint32_t samples_size, uint16_t features_size, bool forward, const std::vector &devs, int verbosity, udevptrs *samples) { From 69d4c8a3a932d08c1df661b3a68b85c5bd7febf2 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Tue, 21 Feb 2017 10:12:14 +0100 Subject: [PATCH 10/14] Fix the last bugs --- src/kmcuda.cc | 10 ++++++---- src/r.cc | 4 ++-- src/test.R | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/kmcuda.cc b/src/kmcuda.cc index 92c7c2f..0a61115 100644 --- a/src/kmcuda.cc +++ b/src/kmcuda.cc @@ -191,8 +191,8 @@ KMCUDAResult kmeans_init_centroids( uint32_t seed, const std::vector &devs, int device_ptrs, int fp16x2, int32_t verbosity, const float *host_centroids, const udevptrs &samples, udevptrs *dists, udevptrs *aux, udevptrs *centroids) { - if (metric == kmcudaDistanceMetricCosine) { - // 3 sanity checks + if (metric == kmcudaDistanceMetricCosine && !fp16x2) { + // 3 sanity checks, not implemented for fp16x2 float *probe; CUCH(cudaMallocManaged(reinterpret_cast(&probe), static_cast(features_size) * sizeof(float)), @@ -208,9 +208,11 @@ KMCUDAResult kmeans_init_centroids( float v = probe[i]; norm += v * v; } - if (norm > 1.0001 || norm < 0.9999) { + const float high = 1.00001; + const float low = 0.99999; + if (norm > high || norm < low) { INFO("error: angular distance: samples[%" PRIu32 "] has L2 norm = %f " - "which is outside [0.9999, 1.0001]\n", s, norm); + "which is outside [%f, %f]\n", s, norm, low, high); return kmcudaInvalidArguments; } } diff --git a/src/r.cc b/src/r.cc index 02c45ca..b44976b 100644 --- a/src/r.cc +++ b/src/r.cc @@ -127,14 +127,14 @@ namespace { double *samples_double = REAL(samples_chunks[i]); SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol); uint32_t fsize = *features_size; - uint32_t ssize = *samples_size; + uint32_t ssize = INTEGER(dims)[0]; #pragma omp parallel for for (uint64_t f = 0; f < fsize; f++) { for (uint64_t s = 0; s < ssize; s++) { samples_float[offset + s * fsize + f] = samples_double[f * ssize + s]; } } - offset += INTEGER(dims)[0] * fsize; + offset += ssize * fsize; } } } diff --git a/src/test.R b/src/test.R index 2f4fc77..92ecc36 100644 --- a/src/test.R +++ b/src/test.R @@ -70,7 +70,7 @@ if (exists("testing")) { init="random", seed=777, verbosity=2, average_distance=TRUE) print(result$average_distance) - expect_equal(result$average_distance, 0.2127081, tolerance=0.0000001); + expect_equal(result$average_distance, 0.2124216, tolerance=0.0000001); }) context("K-nn") From 8481b036ab0a8cdcada147daf6942843f1ab11c8 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Tue, 21 Feb 2017 12:26:09 +0100 Subject: [PATCH 11/14] Change p2p warning level to DEBUG --- src/kmcuda.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kmcuda.cc b/src/kmcuda.cc index 0a61115..7eac28f 100644 --- a/src/kmcuda.cc +++ b/src/kmcuda.cc @@ -120,7 +120,7 @@ static std::vector setup_devices(uint32_t device, int device_ptrs, int verb } auto err = cudaDeviceEnablePeerAccess(odev, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { - INFO("p2p is already enabled on gpu #%d\n", dev); + DEBUG("p2p is already enabled on gpu #%d\n", dev); } else if (err != cudaSuccess) { INFO("warning: failed to enable p2p on gpu #%d: %s\n", dev, cudaGetErrorString(err)); From a345d1482a09e5cf7ee13d2b1f9d0b4d23e949fd Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Tue, 21 Feb 2017 12:26:25 +0100 Subject: [PATCH 12/14] Add R docs --- README.md | 148 ++++++++++++++++++++++++++++++++++++++++++++++-------- src/r.cc | 20 ++++++-- 2 files changed, 143 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index a2ac5b9..5fd4678 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,8 @@ ball tree. Technically, this project is a library which exports the two functions defined in `kmcuda.h`: `kmeans_cuda` and `knn_cuda`. -It has the built-in Python3 native extension support, so you can -`from libKMCUDA import kmeans_cuda`. +It has the built-in Python3 and R native extension support, so you can +`from libKMCUDA import kmeans_cuda` or `dyn.load("libKMCUDA.so")`. [![source{d}](img/sourced.png)](http://sourced.tech)

How this was created?

@@ -40,9 +40,13 @@ Table of contents * [Notes](#notes-1) * [Python examples](#python-examples) * [K-means, L2 (Euclidean) distance](#k-means-l2-euclidean-distance) - * [K-means, angular (cosine) distance average](#k-means-angular-cosine-distance--average) + * [K-means, angular (cosine) distance + average](#k-means-angular-cosine-distance--average) * [K-nn](#k-nn-1) * [Python API](#python-api) +* [R examples](#r-examples) + * [K-means](#k-means-1) + * [K-nn](#k-nn-2) +* [R API](#r-api) * [C examples](#c-examples) * [C API](#c-api) * [License](#license) @@ -123,6 +127,7 @@ It requires cudart 8.0 / Pascal and OpenMP 4.0 capable compiler. The build has been tested primarily on Linux but it works on macOS too with some blows and whistles (see "macOS" subsection). If you do not want to build the Python native module, add `-D DISABLE_PYTHON=y`. +If you do not want to build the R native module, add `-D DISABLE_R=y`. If CUDA is not automatically found, add `-D CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-8.0` (change the path to the actual one). By default, CUDA kernels are compiled for the architecture 60 (Pascal). It is possible to override it via `-D CUDA_ARCH=52`, @@ -276,7 +281,7 @@ calculated 0.276552 of all the distances Python API ---------- ```python -def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++", +def kmeans_cuda(samples, clusters, tolerance=0.01, init="k-means++", yinyang_t=0.1, metric="L2", average_distance=False, seed=time(), device=0, verbosity=0) ``` @@ -289,18 +294,20 @@ def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++", **clusters** integer, the number of clusters. -**tolerance** float, if the relative number of reassignments drops below this value, stop. +**tolerance** float, if the relative number of reassignments drops below this value, + algorithm stops. **init** string or numpy array, sets the method for centroids initialization, - may be "k=means++"/"kmeans++", "random" or numpy array of shape + may be "k-means++", "afk-mc2", "random" or numpy array of shape \[**clusters**, number of features\]. dtype must be float32. **yinyang_t** float, the relative number of cluster groups, usually 0.1. + 0 disables Yinyang refinement. **metric** str, the name of the distance metric to use. The default is Euclidean (L2), - can be changed to "cos" to behave as Spherical K-means with the - angular distance. Please note that samples *must* be normalized in that - case. + it can be changed to "cos" to change the algorithm to Spherical K-means + with the angular distance. Please note that samples *must* be normalized + in the latter case. **average_distance** boolean, the value indicating whether to calculate the average distance between cluster elements and @@ -309,17 +316,18 @@ def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++", **seed** integer, random generator seed for reproducible results. -**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device, 2 means second device, - 3 means using first and second device. Special value 0 enables all available devices. - The default is 0. +**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device, + 2 means second device, 3 means using first and second device. Special + value 0 enables all available devices. The default is 0. **verbosity** integer, 0 means complete silence, 1 means mere progress logging, 2 means lots of output. -**return** tuple(centroids, assignments). If **samples** was a numpy array or - a host pointer tuple, the types are numpy arrays, otherwise, raw pointers - (integers) allocated on the same device. If **samples** are float16, - the returned centroids are float16 too. +**return** tuple(centroids, assignments, \[average_distance\]). + If **samples** was a numpy array or a host pointer tuple, the types + are numpy arrays, otherwise, raw pointers (integers) allocated on the + same device. If **samples** are float16, the returned centroids are + float16 too. ```python def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosity=0) @@ -342,6 +350,108 @@ def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosit to be compatible with uint32. If **samples** is a tuple then **assignments** is a pointer. The shape is (number of samples,). +**metric** str, the name of the distance metric to use. The default is Euclidean (L2), + it can be changed to "cos" to change the algorithm to Spherical K-means + with the angular distance. Please note that samples *must* be normalized + in the latter case. + +**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device, + 2 means second device, 3 means using first and second device. Special + value 0 enables all available devices. The default is 0. + +**verbosity** integer, 0 means complete silence, 1 means mere progress logging, + 2 means lots of output. + +**return** neighbor indices. If **samples** was a numpy array or + a host pointer tuple, the return type is numpy array, otherwise, a + raw pointer (integer) allocated on the same device. The shape is + (number of samples, k). + +R examples +---------- +#### K-means +```R +dyn.load("libKMCUDA.so") +samples = replicate(4, runif(16000)) +result = .External("kmeans_cuda", samples, 50, tolerance=0.01, + seed=777, verbosity=1, average_distance=TRUE) +print(result$average_distance) +print(result$centroids[1:10,]) +print(result$assignments[1:10]) +``` + +#### K-nn +```R +dyn.load("libKMCUDA.so") +samples = replicate(4, runif(16000)) +cls = .External("kmeans_cuda", samples, 50, tolerance=0.01, + seed=777, verbosity=1) +result = .External("knn_cuda", 20, samples, cls$centroids, cls$assignments, + verbosity=1) +print(result[1:10,]) +``` + +R API +----- +```R +function kmeans_cuda( + samples, clusters, tolerance=0.01, init="k-means++", yinyang_t=0.1, + metric="L2", average_distance=FALSE, seed=Sys.time(), device=0, verbosity=0) +``` +**samples** real matrix of shape \[number of samples, number of features\] + or list of real matrices which are rbind()-ed internally. No more + than INT32_MAX samples and UINT16_MAX features are supported. + +**clusters** integer, the number of clusters. + +**tolerance** real, if the relative number of reassignments drops below this value, + algorithm stops. + +**init** character vector or real matrix, sets the method for centroids initialization, + may be "k-means++", "afk-mc2", "random" or real matrix, of shape + \[**clusters**, number of features\]. + +**yinyang_t** real, the relative number of cluster groups, usually 0.1. + 0 disables Yinyang refinement. + +**metric** character vector, the name of the distance metric to use. The default + is Euclidean (L2), it can be changed to "cos" to change the algorithm + to Spherical K-means with the angular distance. Please note that + samples *must* be normalized in the latter case. + +**average_distance** logical, the value indicating whether to calculate + the average distance between cluster elements and + the corresponding centroids. Useful for finding + the best K. Returned as the third list element. + +**seed** integer, random generator seed for reproducible results. + +**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device, + 2 means second device, 3 means using first and second device. Special + value 0 enables all available devices. The default is 0. + +**verbosity** integer, 0 means complete silence, 1 means mere progress logging, + 2 means lots of output. + +**return** list(centroids, assignments\[, average_distance\]). Indices in + assignments start from 1. + +```R +function knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosity=0) +``` +**k** integer, the number of neighbors to search for each sample. Must be ≤ 116. + +**samples** real matrix of shape \[number of samples, number of features\] + or list of real matrices which are rbind()-ed internally. + In the latter case, is is possible to pass in more than INT32_MAX + samples. + +**centroids** real matrix with precalculated clusters' centroids (e.g., using + kmeans() or kmeans_cuda()). + +**assignments** integer vector with sample-cluster associations. Indices start + from 1. + **metric** str, the name of the distance metric to use. The default is Euclidean (L2), can be changed to "cos" to behave as Spherical K-means with the angular distance. Please note that samples *must* be normalized in that @@ -354,10 +464,8 @@ def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosit **verbosity** integer, 0 means complete silence, 1 means mere progress logging, 2 means lots of output. -**return** neighbor indices. If **samples** was a numpy array or - a host pointer tuple, the return type is numpy array, otherwise, a - raw pointer (integer) allocated on the same device. The shape is - (number of samples, k). +**return** integer matrix with neighbor indices. The shape is (number of samples, k). + Indices start from 1. C examples ---------- diff --git a/src/r.cc b/src/r.cc index b44976b..1fa9a0c 100644 --- a/src/r.cc +++ b/src/r.cc @@ -301,7 +301,12 @@ static SEXP r_kmeans_cuda(SEXP args) { } } SEXP assignments2 = PROTECT(allocVector(INTSXP, samples_size)); - memcpy(INTEGER(assignments2), assignments.get(), samples_size * sizeof(int)); + uint32_t *assignments_ptr = assignments.get(); + int *assignments2_ptr = INTEGER(assignments2); + #pragma omp parallel for simd + for (uint32_t i = 0; i < samples_size; i++) { + assignments2_ptr[i] = assignments_ptr[i] + 1; + } SEXP tuple = PROTECT(allocVector(VECSXP, 2 + (average_distance_ptr != nullptr))); SET_VECTOR_ELT(tuple, 0, centroids2); SET_VECTOR_ELT(tuple, 1, assignments2); @@ -365,15 +370,20 @@ static SEXP r_knn_cuda(SEXP args) { if (static_cast(length(assignments_iter->second)) != samples_size) { error("invalid \"assignments\"'s length"); } + std::unique_ptr assignments(new uint32_t[samples_size]); + int *assignments_obj_ptr = INTEGER(assignments_iter->second); + uint32_t *assignments_ptr = assignments.get(); + #pragma omp parallel for simd + for (uint32_t i = 0; i < samples_size; i++) { + assignments_ptr[i] = assignments_obj_ptr[i] - 1; + } KMCUDADistanceMetric metric = parse_metric(kwargs); int device = parse_device(kwargs); int verbosity = parse_int(kwargs, "verbosity", 0); std::unique_ptr neighbors(new uint32_t[samples_size * k]); auto result = knn_cuda( k, metric, samples_size, features_size, clusters_size, device, -1, 0, - verbosity, samples.get(), centroids.get(), - reinterpret_cast(INTEGER(assignments_iter->second)), - neighbors.get()); + verbosity, samples.get(), centroids.get(), assignments_ptr, neighbors.get()); if (result != kmcudaSuccess) { error("knn_cuda error %d %s%s", result, kmcuda::statuses.find(result)->second, (verbosity > 0)? "" : "; " @@ -385,7 +395,7 @@ static SEXP r_knn_cuda(SEXP args) { #pragma omp parallel for for (int i = 0; i < k; i++) { for (uint32_t s = 0; s < samples_size; s++) { - neighbors_obj_ptr[i * samples_size + s] = neighbors_ptr[s * k + i]; + neighbors_obj_ptr[i * samples_size + s] = neighbors_ptr[s * k + i] + 1; } } UNPROTECT(1); From f160e2620c86593907df398d433b68995201ff2e Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Tue, 21 Feb 2017 12:46:59 +0100 Subject: [PATCH 13/14] Add R build in Travis --- .travis.linux | 3 ++- .travis.osx | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis.linux b/.travis.linux index 1e32db6..125d81f 100644 --- a/.travis.linux +++ b/.travis.linux @@ -1,8 +1,9 @@ #!/bin/sh +rm -rf /opt/python sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_8.0.44-1_amd64.deb sudo dpkg -i cuda-repo-ubuntu1404_8.0.44-1_amd64.deb sudo apt-get update -sudo apt-get install -y --no-install-suggests --no-install-recommends g++-5 python3-dev python3-numpy cuda-cudart-dev-8-0 cuda-curand-dev-8-0 cuda-core-8-0 cuda-misc-headers-8-0 +sudo apt-get install -y --no-install-suggests --no-install-recommends g++-5 python3-dev python3-numpy r-base-core cuda-cudart-dev-8-0 cuda-curand-dev-8-0 cuda-core-8-0 cuda-misc-headers-8-0 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1 --slave /usr/bin/g++ g++ /usr/bin/g++-5 \ No newline at end of file diff --git a/.travis.osx b/.travis.osx index ea74d06..4af6ed3 100644 --- a/.travis.osx +++ b/.travis.osx @@ -1,7 +1,8 @@ #!/bin/sh brew install llvm --with-clang -brew install python3 +brew tap homebrew/science +brew install python3 r pip3 install numpy brew cask update brew cask install --verbose cuda From 02e824a3751eff14ef04bcd47a0e347e4fcfe7ec Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Tue, 21 Feb 2017 13:36:03 +0100 Subject: [PATCH 14/14] Fix macOS build --- src/kmcuda.h | 1 + src/r.cc | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/kmcuda.h b/src/kmcuda.h index 0e1d4ae..4b1c2a3 100644 --- a/src/kmcuda.h +++ b/src/kmcuda.h @@ -103,6 +103,7 @@ KMCUDAResult knn_cuda( #endif #ifdef __cplusplus +#include #include namespace { diff --git a/src/r.cc b/src/r.cc index 1fa9a0c..a8155b8 100644 --- a/src/r.cc +++ b/src/r.cc @@ -303,10 +303,17 @@ static SEXP r_kmeans_cuda(SEXP args) { SEXP assignments2 = PROTECT(allocVector(INTSXP, samples_size)); uint32_t *assignments_ptr = assignments.get(); int *assignments2_ptr = INTEGER(assignments2); + #ifndef __APPLE__ #pragma omp parallel for simd for (uint32_t i = 0; i < samples_size; i++) { assignments2_ptr[i] = assignments_ptr[i] + 1; } + #else + #pragma omp simd + for (uint32_t i = 0; i < samples_size; i++) { + assignments2_ptr[i] = assignments_ptr[i] + 1; + } + #endif SEXP tuple = PROTECT(allocVector(VECSXP, 2 + (average_distance_ptr != nullptr))); SET_VECTOR_ELT(tuple, 0, centroids2); SET_VECTOR_ELT(tuple, 1, assignments2); @@ -373,10 +380,17 @@ static SEXP r_knn_cuda(SEXP args) { std::unique_ptr assignments(new uint32_t[samples_size]); int *assignments_obj_ptr = INTEGER(assignments_iter->second); uint32_t *assignments_ptr = assignments.get(); + #ifndef __APPLE__ #pragma omp parallel for simd for (uint32_t i = 0; i < samples_size; i++) { assignments_ptr[i] = assignments_obj_ptr[i] - 1; } + #else + #pragma omp simd + for (uint32_t i = 0; i < samples_size; i++) { + assignments_ptr[i] = assignments_obj_ptr[i] - 1; + } + #endif KMCUDADistanceMetric metric = parse_metric(kwargs); int device = parse_device(kwargs); int verbosity = parse_int(kwargs, "verbosity", 0);