diff --git a/.gitignore b/.gitignore
index 1303fee..5323a25 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 .idea
+cmake-build-*
+**/*.cbp
 **/CMakeCache.txt
 **/CMakeFiles
 **/.DS_Store
diff --git a/.travis.linux b/.travis.linux
index 1e32db6..125d81f 100644
--- a/.travis.linux
+++ b/.travis.linux
@@ -1,8 +1,9 @@
 #!/bin/sh
 
+rm -rf /opt/python
 sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
 wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_8.0.44-1_amd64.deb
 sudo dpkg -i cuda-repo-ubuntu1404_8.0.44-1_amd64.deb
 sudo apt-get update
-sudo apt-get install -y --no-install-suggests --no-install-recommends g++-5 python3-dev python3-numpy cuda-cudart-dev-8-0 cuda-curand-dev-8-0 cuda-core-8-0 cuda-misc-headers-8-0
+sudo apt-get install -y --no-install-suggests --no-install-recommends g++-5 python3-dev python3-numpy r-base-core cuda-cudart-dev-8-0 cuda-curand-dev-8-0 cuda-core-8-0 cuda-misc-headers-8-0
 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1 --slave /usr/bin/g++ g++ /usr/bin/g++-5
\ No newline at end of file
diff --git a/.travis.osx b/.travis.osx
index ea74d06..4af6ed3 100644
--- a/.travis.osx
+++ b/.travis.osx
@@ -1,7 +1,8 @@
 #!/bin/sh
 
 brew install llvm --with-clang
-brew install python3
+brew tap homebrew/science
+brew install python3 r
 pip3 install numpy
 brew cask update
 brew cask install --verbose cuda
diff --git a/README.md b/README.md
index a2ac5b9..097543a 100644
--- a/README.md
+++ b/README.md
@@ -18,8 +18,8 @@ ball tree.
 
 Technically, this project is a library which exports the two functions
 defined in `kmcuda.h`: `kmeans_cuda` and `knn_cuda`.
-It has the built-in Python3 native extension support, so you can
-`from libKMCUDA import kmeans_cuda`.
+It has the built-in Python3 and R native extension support, so you can
+`from libKMCUDA import kmeans_cuda` or `dyn.load("libKMCUDA.so")`.
 
 [![source{d}](img/sourced.png)](http://sourced.tech)
 <p align="right"><a href="img/kmeans_image.ipynb">How this was created?</a></p>
@@ -33,16 +33,23 @@ Table of contents
       * [macOS](#macos)
 * [Testing](#testing)
 * [Benchmarks](#benchmarks)
-   * [100000x256@1024](#100000x2561024)
+   * [100,000x256@1024](#100000x2561024)
       * [Configuration](#configuration)
       * [Contestants](#contestants)
       * [Data](#data)
       * [Notes](#notes-1)
+   * [8,000,000x256@1024](#8000000x2561024)
+      * [Data](#data-1)
+      * [Notes](#notes-2)
 * [Python examples](#python-examples)
       * [K-means, L2 (Euclidean) distance](#k-means-l2-euclidean-distance)
-      * [K-means, angular (cosine) distance   average](#k-means-angular-cosine-distance--average)
+      * [K-means, angular (cosine) distance + average](#k-means-angular-cosine-distance--average)
       * [K-nn](#k-nn-1)
 * [Python API](#python-api)
+* [R examples](#r-examples)
+      * [K-means](#k-means-1)
+      * [K-nn](#k-nn-2)
+* [R API](#r-api)
 * [C examples](#c-examples)
 * [C API](#c-api)
 * [License](#license)
@@ -123,6 +130,7 @@ It requires cudart 8.0 / Pascal and OpenMP 4.0 capable compiler. The build has
 been tested primarily on Linux but it works on macOS too with some blows and whistles
 (see "macOS" subsection).
 If you do not want to build the Python native module, add `-D DISABLE_PYTHON=y`.
+If you do not want to build the R native module, add `-D DISABLE_R=y`.
 If CUDA is not automatically found, add `-D CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-8.0`
 (change the path to the actual one). By default, CUDA kernels are compiled for
 the architecture 60 (Pascal). It is possible to override it via `-D CUDA_ARCH=52`,
@@ -167,8 +175,6 @@ Benchmarks
 ----------
 
 ### 100000x256@1024
-Comparison of some KMeans implementations:
-
 |            | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda | kmcuda 2 GPU |
 |------------|----------------|-----------|------------------|--------|--------|--------------|
 | time, s    | 164            | 36        | 20               | 10.6   | 9.2    | 5.5          |
@@ -193,6 +199,21 @@ Comparison of some KMeans implementations:
 #### Notes
 100000 is the maximum size Serban KMeans can handle.
 
+### 8000000x256@1024
+|            | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda 2 GPU | kmcuda Yinyang 2 GPU |
+|------------|----------------|-----------|------------------|--------|--------------|----------------------|
+| time       | please no      | -         | 6h 34m           | fail   | 44m          | 36m                  |
+| memory, GB | -              | -         | 205              | fail   | 8.7          | 10.4                 |
+
+kmeans++ initialization, 93 iterations (1% reassignments equivalent).
+
+#### Data
+8,000,000 secret production samples.
+
+#### Notes
+KmeansRex did eat 205 GB of RAM on peak; it uses dynamic memory so it constantly
+bounced from 100 GB to 200 GB.
+
 Python examples
 ---------------
 
@@ -276,7 +297,7 @@ calculated 0.276552 of all the distances
 Python API
 ----------
 ```python
-def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",
+def kmeans_cuda(samples, clusters, tolerance=0.01, init="k-means++",
                 yinyang_t=0.1, metric="L2", average_distance=False,
                 seed=time(), device=0, verbosity=0)
 ```
@@ -289,18 +310,20 @@ def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",
 
 **clusters** integer, the number of clusters.
 
-**tolerance** float, if the relative number of reassignments drops below this value, stop.
+**tolerance** float, if the relative number of reassignments drops below this value,
+              algorithm stops.
 
 **init** string or numpy array, sets the method for centroids initialization,
-         may be "k=means++"/"kmeans++", "random" or numpy array of shape
+         may be "k-means++", "afk-mc2", "random" or numpy array of shape
          \[**clusters**, number of features\]. dtype must be float32.
 
 **yinyang_t** float, the relative number of cluster groups, usually 0.1.
+              0 disables Yinyang refinement.
 
 **metric** str, the name of the distance metric to use. The default is Euclidean (L2),
-           can be changed to "cos" to behave as Spherical K-means with the
-           angular distance. Please note that samples *must* be normalized in that
-           case.
+           it can be changed to "cos" to change the algorithm to Spherical K-means
+           with the angular distance. Please note that samples *must* be normalized
+           in the latter case.
 
 **average_distance** boolean, the value indicating whether to calculate
                      the average distance between cluster elements and
@@ -309,17 +332,18 @@ def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",
 
 **seed** integer, random generator seed for reproducible results.
 
-**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device, 2 means second device,
-           3 means using first and second device. Special value 0 enables all available devices.
-           The default is 0.
+**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
+           2 means second device, 3 means using first and second device. Special
+           value 0 enables all available devices. The default is 0.
 
 **verbosity** integer, 0 means complete silence, 1 means mere progress logging,
               2 means lots of output.
 
-**return** tuple(centroids, assignments). If **samples** was a numpy array or
-           a host pointer tuple, the types are numpy arrays, otherwise, raw pointers
-           (integers) allocated on the same device. If **samples** are float16,
-           the returned centroids are float16 too.
+**return** tuple(centroids, assignments, \[average_distance\]).
+           If **samples** was a numpy array or a host pointer tuple, the types
+           are numpy arrays, otherwise, raw pointers (integers) allocated on the
+           same device. If **samples** are float16, the returned centroids are
+           float16 too.
 
 ```python
 def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosity=0)
@@ -342,6 +366,108 @@ def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosit
                 to be compatible with uint32. If **samples** is a tuple then
                 **assignments** is a pointer. The shape is (number of samples,).
 
+**metric** str, the name of the distance metric to use. The default is Euclidean (L2),
+           it can be changed to "cos" to change the algorithm to Spherical K-means
+           with the angular distance. Please note that samples *must* be normalized
+           in the latter case.
+
+**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
+           2 means second device, 3 means using first and second device. Special
+           value 0 enables all available devices. The default is 0.
+
+**verbosity** integer, 0 means complete silence, 1 means mere progress logging,
+              2 means lots of output.
+
+**return** neighbor indices. If **samples** was a numpy array or
+            a host pointer tuple, the return type is numpy array, otherwise, a
+            raw pointer (integer) allocated on the same device. The shape is
+            (number of samples, k).
+
+R examples
+----------
+#### K-means
+```R
+dyn.load("libKMCUDA.so")
+samples = replicate(4, runif(16000))
+result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
+                   seed=777, verbosity=1, average_distance=TRUE)
+print(result$average_distance)
+print(result$centroids[1:10,])
+print(result$assignments[1:10])
+```
+
+#### K-nn
+```R
+dyn.load("libKMCUDA.so")
+samples = replicate(4, runif(16000))
+cls = .External("kmeans_cuda", samples, 50, tolerance=0.01,
+                seed=777, verbosity=1)
+result = .External("knn_cuda", 20, samples, cls$centroids, cls$assignments,
+                   verbosity=1)
+print(result[1:10,])
+```
+
+R API
+-----
+```R
+function kmeans_cuda(
+    samples, clusters, tolerance=0.01, init="k-means++", yinyang_t=0.1,
+    metric="L2", average_distance=FALSE, seed=Sys.time(), device=0, verbosity=0)
+```
+**samples** real matrix of shape \[number of samples, number of features\]
+            or list of real matrices which are rbind()-ed internally. No more
+            than INT32_MAX samples and UINT16_MAX features are supported.
+
+**clusters** integer, the number of clusters.
+
+**tolerance** real, if the relative number of reassignments drops below this value,
+              algorithm stops.
+
+**init** character vector or real matrix, sets the method for centroids initialization,
+         may be "k-means++", "afk-mc2", "random" or real matrix, of shape
+         \[**clusters**, number of features\].
+
+**yinyang_t** real, the relative number of cluster groups, usually 0.1.
+              0 disables Yinyang refinement.
+
+**metric** character vector, the name of the distance metric to use. The default
+           is Euclidean (L2), it can be changed to "cos" to change the algorithm
+           to Spherical K-means with the angular distance. Please note that
+           samples *must* be normalized in the latter case.
+
+**average_distance** logical, the value indicating whether to calculate
+                     the average distance between cluster elements and
+                     the corresponding centroids. Useful for finding
+                     the best K. Returned as the third list element.
+
+**seed** integer, random generator seed for reproducible results.
+
+**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
+           2 means second device, 3 means using first and second device. Special
+           value 0 enables all available devices. The default is 0.
+
+**verbosity** integer, 0 means complete silence, 1 means mere progress logging,
+              2 means lots of output.
+
+**return** list(centroids, assignments\[, average_distance\]). Indices in
+           assignments start from 1.
+
+```R
+function knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosity=0)
+```
+**k** integer, the number of neighbors to search for each sample. Must be ≤ 1<sup>16</sup>.
+
+**samples** real matrix of shape \[number of samples, number of features\]
+            or list of real matrices which are rbind()-ed internally.
+            In the latter case, is is possible to pass in more than INT32_MAX
+            samples.
+
+**centroids** real matrix with precalculated clusters' centroids (e.g., using
+              kmeans() or kmeans_cuda()).
+
+**assignments** integer vector with sample-cluster associations. Indices start
+                from 1.
+
 **metric** str, the name of the distance metric to use. The default is Euclidean (L2),
                 can be changed to "cos" to behave as Spherical K-means with the
                 angular distance. Please note that samples *must* be normalized in that
@@ -354,10 +480,8 @@ def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosit
 **verbosity** integer, 0 means complete silence, 1 means mere progress logging,
               2 means lots of output.
 
-**return** neighbor indices. If **samples** was a numpy array or
-            a host pointer tuple, the return type is numpy array, otherwise, a
-            raw pointer (integer) allocated on the same device. The shape is
-            (number of samples, k).
+**return** integer matrix with neighbor indices. The shape is (number of samples, k).
+           Indices start from 1.
 
 C examples
 ----------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 319319f..77a392a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,6 @@
 cmake_minimum_required(VERSION 3.2)
 project(KMCUDA)
+set(CMAKE_MODULE_PATH ${CMAKE_HOME_DIRECTORY}/cmake)
 find_package(OpenMP REQUIRED)
 if (APPLE AND NOT CUDA_HOST_COMPILER)
   # https://gitlab.kitware.com/cmake/cmake/issues/13674
@@ -24,7 +25,9 @@ if (NOT DISABLE_PYTHON)
     execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy; print(numpy.get_include())" OUTPUT_VARIABLE NUMPY_INCLUDES)
   endif()
 endif()
-
+if (NOT DISABLE_R)
+  find_package(R)
+endif()
 if (PROFILE OR CMAKE_BUILD_TYPE STREQUAL "Debug")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPROFILE")
 endif()
@@ -35,9 +38,12 @@ endif()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}")
 set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h tricks.cuh
                  metric_abstraction.h kmeans.cu knn.cu transpose.cu)
-if (NOT DISABLE_PYTHON)
+if (PYTHONLIBS_FOUND)
   list(APPEND SOURCE_FILES python.cc)
 endif()
+if (R_FOUND)
+  list(APPEND SOURCE_FILES r.cc)
+endif()
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
   set(NVCC_FLAGS "-G -g")
 endif()
@@ -59,10 +65,14 @@ if (APPLE)
   set(CMAKE_SHARED_LIBRARY_CXX_FLAGS "${CMAKE_SHARED_LIBRARY_CXX_FLAGS_BACKUP}")
 endif()
 target_link_libraries(KMCUDA ${CUDA_curand_LIBRARY})
-if(PYTHONLIBS_FOUND)
+if (PYTHONLIBS_FOUND)
   include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDES})
   target_link_libraries(KMCUDA ${PYTHON_LIBRARIES})
 endif()
+if (R_FOUND)
+  include_directories(${R_INCLUDE_DIRS})
+  target_link_libraries(KMCUDA ${R_LIBRARIES})
+endif()
 if (SUFFIX)
   set_target_properties(KMCUDA PROPERTIES SUFFIX ${SUFFIX})
 endif()
diff --git a/src/cmake/FindR.cmake b/src/cmake/FindR.cmake
new file mode 100644
index 0000000..2802671
--- /dev/null
+++ b/src/cmake/FindR.cmake
@@ -0,0 +1,55 @@
+# CMake module to find R
+# - Try to find R
+# Once done, this will define
+#
+#  R_FOUND - system has R
+#  R_INCLUDE_DIRS - the R include directories
+#  R_LIBRARIES - link these to use R
+#  R_ROOT_DIR - As reported by R
+# Autor: Omar Andres Zapata Mesa 31/05/2013
+if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+  set(CMAKE_FIND_APPBUNDLE "LAST")
+endif()
+find_program(R_EXECUTABLE NAMES R R.exe)
+#---searching R installtion unsing R executable
+if(R_EXECUTABLE)
+  execute_process(COMMAND ${R_EXECUTABLE} RHOME
+                  OUTPUT_VARIABLE R_ROOT_DIR
+                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+  find_path(R_INCLUDE_DIR R.h
+            HINTS ${R_ROOT_DIR}
+            PATHS /usr/local/lib /usr/local/lib64 /usr/share
+            PATH_SUFFIXES include R/include
+            DOC "Path to file R.h")
+  find_library(R_LIBRARY R
+            HINTS ${R_ROOT_DIR}/lib
+            DOC "R library (example libR.a, libR.dylib, etc.).")
+endif()
+#---setting include dirs and libraries
+set(R_LIBRARIES ${R_LIBRARY})
+set(R_INCLUDE_DIRS ${R_INCLUDE_DIR})
+foreach(_cpt ${R_FIND_COMPONENTS})
+  execute_process(COMMAND echo "cat(find.package('${_cpt}'))"
+                  COMMAND ${R_EXECUTABLE} --vanilla --slave
+                  OUTPUT_VARIABLE _cpt_path
+                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+  find_library(R_${_cpt}_LIBRARY
+               lib${_cpt}.so lib${_cpt}.dylib
+               HINTS ${_cpt_path}/lib)
+  if(R_${_cpt}_LIBRARY)
+    mark_as_advanced(R_${_cpt}_LIBRARY)
+    list(APPEND R_LIBRARIES ${R_${_cpt}_LIBRARY})
+  endif()
+  find_path(R_${_cpt}_INCLUDE_DIR ${_cpt}.h HINTS  ${_cpt_path} PATH_SUFFIXES include R/include)
+  if(R_${_cpt}_INCLUDE_DIR)
+    mark_as_advanced(R_${_cpt}_INCLUDE_DIR)
+    list(APPEND R_INCLUDE_DIRS ${R_${_cpt}_INCLUDE_DIR})
+  endif()
+  if(R_${_cpt}_INCLUDE_DIR AND R_${_cpt}_LIBRARY)
+    list(REMOVE_ITEM R_FIND_COMPONENTS ${_cpt})
+  endif()
+endforeach()
+# Handle the QUIETLY and REQUIRED arguments and set R_FOUND to TRUE if all listed variables are TRUE
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(R DEFAULT_MSG R_EXECUTABLE R_INCLUDE_DIR R_LIBRARY)
+mark_as_advanced(R_FOUND R_EXECUTABLE R_INCLUDE_DIR R_LIBRARY)
diff --git a/src/kmcuda.cc b/src/kmcuda.cc
index 02079b9..7eac28f 100644
--- a/src/kmcuda.cc
+++ b/src/kmcuda.cc
@@ -120,7 +120,7 @@ static std::vector<int> setup_devices(uint32_t device, int device_ptrs, int verb
         }
         auto err = cudaDeviceEnablePeerAccess(odev, 0);
         if (err == cudaErrorPeerAccessAlreadyEnabled) {
-          INFO("p2p is already enabled on gpu #%d\n", dev);
+          DEBUG("p2p is already enabled on gpu #%d\n", dev);
         } else if (err != cudaSuccess) {
           INFO("warning: failed to enable p2p on gpu #%d: %s\n", dev,
                cudaGetErrorString(err));
@@ -191,6 +191,33 @@ KMCUDAResult kmeans_init_centroids(
     uint32_t seed, const std::vector<int> &devs, int device_ptrs, int fp16x2,
     int32_t verbosity, const float *host_centroids, const udevptrs<float> &samples,
     udevptrs<float> *dists, udevptrs<float> *aux, udevptrs<float> *centroids) {
+  if (metric == kmcudaDistanceMetricCosine && !fp16x2) {
+    // 3 sanity checks, not implemented for fp16x2
+    float *probe;
+    CUCH(cudaMallocManaged(reinterpret_cast<void**>(&probe),
+                           static_cast<uint32_t>(features_size) * sizeof(float)),
+         kmcudaMemoryAllocationFailure);
+    unique_devptr<float> managed(probe);
+    cudaSetDevice(devs[0]);
+    for (uint32_t s : {0u, samples_size / 2, samples_size - 1}) {
+      RETERR(cuda_extract_sample_t(
+          s, samples_size, features_size, verbosity, samples[0].get(), probe));
+      double norm = 0;
+      #pragma omp simd
+      for (uint16_t i = 0; i < features_size; i++) {
+        float v = probe[i];
+        norm += v * v;
+      }
+      const float high = 1.00001;
+      const float low = 0.99999;
+      if (norm > high || norm < low) {
+        INFO("error: angular distance: samples[%" PRIu32 "] has L2 norm = %f "
+             "which is outside [%f, %f]\n", s, norm, low, high);
+        return kmcudaInvalidArguments;
+      }
+    }
+  }
+
   srand(seed);
   switch (method) {
     case kmcudaInitMethodImport:
diff --git a/src/kmcuda.h b/src/kmcuda.h
index 61ca97b..4b1c2a3 100644
--- a/src/kmcuda.h
+++ b/src/kmcuda.h
@@ -102,4 +102,39 @@ KMCUDAResult knn_cuda(
 }  // extern "C"
 #endif
 
+#ifdef __cplusplus
+#include <string>
+#include <unordered_map>
+
+namespace {
+namespace kmcuda {
+const std::unordered_map<std::string, KMCUDAInitMethod> init_methods {
+    {"kmeans++", kmcudaInitMethodPlusPlus},
+    {"k-means++", kmcudaInitMethodPlusPlus},
+    {"afkmc2", kmcudaInitMethodAFKMC2},
+    {"afk-mc2", kmcudaInitMethodAFKMC2},
+    {"random", kmcudaInitMethodRandom}
+};
+
+const std::unordered_map<std::string, KMCUDADistanceMetric> metrics {
+    {"euclidean", kmcudaDistanceMetricL2},
+    {"L2", kmcudaDistanceMetricL2},
+    {"l2", kmcudaDistanceMetricL2},
+    {"cos", kmcudaDistanceMetricCosine},
+    {"cosine", kmcudaDistanceMetricCosine},
+    {"angular", kmcudaDistanceMetricCosine}
+};
+
+const std::unordered_map<int, const char *> statuses {
+    {kmcudaSuccess, "Success"},
+    {kmcudaInvalidArguments, "InvalidArguments"},
+    {kmcudaNoSuchDevice, "NoSuchDevice"},
+    {kmcudaMemoryAllocationFailure, "MemoryAllocationFailure"},
+    {kmcudaRuntimeError, "RuntimeError"},
+    {kmcudaMemoryCopyError, "MemoryCopyError"}
+};
+}  // namespace kmcuda
+}  // namespace
+#endif  // __cplusplus
+
 #endif //KMCUDA_KMCUDA_H
diff --git a/src/kmeans.cu b/src/kmeans.cu
index 2a3cbf4..f509878 100644
--- a/src/kmeans.cu
+++ b/src/kmeans.cu
@@ -437,11 +437,9 @@ __global__ void kmeans_yy_init(
   if (sample >= length) {
     return;
   }
-  bounds += static_cast<uint64_t>(sample) * (d_yy_groups_size + 1);
   for (uint32_t i = 0; i < d_yy_groups_size + 1; i++) {
-    bounds[i] = FLT_MAX;
+    bounds[static_cast<uint64_t>(length) * i + sample] = FLT_MAX;
   }
-  bounds++;
   uint32_t nearest = assignments[sample];
   extern __shared__ float shared_memory[];
   F *volatile shared_centroids = reinterpret_cast<F*>(shared_memory);
@@ -475,11 +473,12 @@ __global__ void kmeans_yy_init(
           samples, shared_centroids + (c - gc) * d_features_size,
           d_samples_size, sample + offset);
       if (c != nearest) {
-        if (dist < bounds[group]) {
-          bounds[group] = dist;
+        uint64_t gindex = static_cast<uint64_t>(length) * (1 + group) + sample;
+        if (dist < bounds[gindex]) {
+          bounds[gindex] = dist;
         }
       } else {
-        bounds[-1] = dist;
+        bounds[sample] = dist;
       }
     }
   }
@@ -549,44 +548,42 @@ __global__ void kmeans_yy_global_filter(
   if (sample >= length) {
     return;
   }
-  bounds += static_cast<uint64_t>(sample) * (d_yy_groups_size + 1);
   uint32_t cluster = assignments[sample];
   assignments_prev[sample] = cluster;
-  float upper_bound = bounds[0];
+  float upper_bound = bounds[sample];
   uint32_t doffset = d_clusters_size * d_features_size;
   float cluster_drift = drifts[doffset + cluster];
   upper_bound += cluster_drift;
-  bounds++;
   float min_lower_bound = FLT_MAX;
   for (uint32_t g = 0; g < d_yy_groups_size; g++) {
-    float lower_bound = bounds[g] - drifts[g];
-    bounds[g] = lower_bound;
+    uint64_t gindex = static_cast<uint64_t>(length) * (1 + g) + sample;
+    float lower_bound = bounds[gindex] - drifts[g];
+    bounds[gindex] = lower_bound;
     if (lower_bound < min_lower_bound) {
       min_lower_bound = lower_bound;
     }
   }
-  bounds--;
   // group filter try #1
   if (min_lower_bound >= upper_bound) {
-    bounds[0] = upper_bound;
+    bounds[sample] = upper_bound;
     return;
   }
   upper_bound = 0;
   upper_bound = METRIC<M, F>::distance_t(
       samples, centroids + cluster * d_features_size,
       d_samples_size, sample + offset);
-  bounds[0] = upper_bound;
+  bounds[sample] = upper_bound;
   // group filter try #2
   if (min_lower_bound >= upper_bound) {
     return;
   }
-  // D'oh!
+  // d'oh!
   passed[atomicAggInc(&d_passed_number)] = sample;
 }
 
 template <KMCUDADistanceMetric M, typename F>
 __global__ void kmeans_yy_local_filter(
-    const uint32_t offset, const F *__restrict__ samples,
+    const uint32_t offset, const uint32_t length, const F *__restrict__ samples,
     const uint32_t *__restrict__ passed, const F *__restrict__ centroids,
     const uint32_t *__restrict__ groups, const float *__restrict__ drifts,
     uint32_t *__restrict__ assignments, float *__restrict__ bounds) {
@@ -595,9 +592,7 @@ __global__ void kmeans_yy_local_filter(
     return;
   }
   sample = passed[sample];
-  bounds += static_cast<uint64_t>(sample) * (d_yy_groups_size + 1);
-  float upper_bound = bounds[0];
-  bounds++;
+  float upper_bound = bounds[sample];
   uint32_t cluster = assignments[sample];
   uint32_t doffset = d_clusters_size * d_features_size;
   float min_dist = upper_bound, second_min_dist = FLT_MAX;
@@ -633,7 +628,8 @@ __global__ void kmeans_yy_local_filter(
         // this may happen if the centroid is insane (NaN)
         continue;
       }
-      float lower_bound = bounds[group];
+      float lower_bound = bounds[
+          static_cast<uint64_t>(length) * (1 + group) + sample];
       if (lower_bound >= upper_bound) {
         if (lower_bound < second_min_dist) {
           second_min_dist = lower_bound;
@@ -658,14 +654,17 @@ __global__ void kmeans_yy_local_filter(
   }
   uint32_t nearest_group = groups[nearest];
   uint32_t previous_group = groups[cluster];
-  bounds[nearest_group] = second_min_dist;
+  bounds[static_cast<uint64_t>(length) * (1 + nearest_group) + sample] =
+      second_min_dist;
   if (nearest_group != previous_group) {
-    float pb = bounds[previous_group];
+    uint64_t gindex =
+        static_cast<uint64_t>(length) * (1 + previous_group) + sample;
+    float pb = bounds[gindex];
     if (pb > upper_bound) {
-      bounds[previous_group] = upper_bound;
+      bounds[gindex] = upper_bound;
     }
   }
-  bounds[-1] = min_dist;
+  bounds[sample] = min_dist;
   if (cluster != nearest) {
     assignments[sample] = nearest;
     atomicAggInc(&d_changed_number);
@@ -1244,7 +1243,7 @@ KMCUDAResult kmeans_cuda_yy(
           (*bounds_yy)[devi].get(), (*passed_yy)[devi].get()));
       dim3 slgrid(upper(length, slblock.x), 1, 1);
       KERNEL_SWITCH(kmeans_yy_local_filter, <<<slgrid, slblock, shmem_sizes[devi]>>>(
-          offset, reinterpret_cast<const F*>(samples[devi].get()),
+          offset, length, reinterpret_cast<const F*>(samples[devi].get()),
           (*passed_yy)[devi].get(), reinterpret_cast<const F*>((*centroids)[devi].get()),
           (*assignments_yy)[devi].get(), (*drifts_yy)[devi].get(),
           (*assignments)[devi].get() + offset, (*bounds_yy)[devi].get()));
diff --git a/src/private.h b/src/private.h
index 1e93c89..0ea04e2 100644
--- a/src/private.h
+++ b/src/private.h
@@ -259,6 +259,10 @@ KMCUDAResult cuda_copy_sample_t(
     const std::vector<int> &devs, int verbosity, const udevptrs<float> &samples,
     udevptrs<float> *dest);
 
+KMCUDAResult cuda_extract_sample_t(
+    uint32_t index, uint32_t samples_size, uint16_t features_size,
+    int verbosity, const float *samples, float *dest);
+
 KMCUDAResult cuda_transpose(
     uint32_t samples_size, uint16_t features_size, bool forward,
     const std::vector<int> &devs, int verbosity, udevptrs<float> *samples);
diff --git a/src/python.cc b/src/python.cc
index 94ad02a..879d0fa 100644
--- a/src/python.cc
+++ b/src/python.cc
@@ -68,22 +68,6 @@ class _pyobj : public pyobj_parent<O> {
 using pyobj = _pyobj<PyObject>;
 using pyarray = _pyobj<PyArrayObject>;
 
-static const std::unordered_map<std::string, KMCUDAInitMethod> init_methods {
-    {"kmeans++", kmcudaInitMethodPlusPlus},
-    {"k-means++", kmcudaInitMethodPlusPlus},
-    {"afkmc2", kmcudaInitMethodAFKMC2},
-    {"random", kmcudaInitMethodRandom}
-};
-
-static const std::unordered_map<std::string, KMCUDADistanceMetric > metrics {
-    {"euclidean", kmcudaDistanceMetricL2},
-    {"L2", kmcudaDistanceMetricL2},
-    {"l2", kmcudaDistanceMetricL2},
-    {"cos", kmcudaDistanceMetricCosine},
-    {"cosine", kmcudaDistanceMetricCosine},
-    {"angular", kmcudaDistanceMetricCosine}
-};
-
 static void set_cuda_malloc_error() {
   PyErr_SetString(PyExc_MemoryError, "Failed to allocate memory on GPU");
 }
@@ -106,8 +90,8 @@ static bool get_metric(PyObject *metric_obj, KMCUDADistanceMetric *metric) {
     return false;
   } else {
     pyobj bytes(PyUnicode_AsASCIIString(metric_obj));
-    auto immetric = metrics.find(PyBytes_AsString(bytes.get()));
-    if (immetric == metrics.end()) {
+    auto immetric = kmcuda::metrics.find(PyBytes_AsString(bytes.get()));
+    if (immetric == kmcuda::metrics.end()) {
       PyErr_SetString(
           PyExc_ValueError,
           "Unknown metric. Supported values are \"L2\" and \"cos\".");
@@ -193,8 +177,8 @@ static PyObject *py_kmeans_cuda(PyObject *self, PyObject *args, PyObject *kwargs
   KMCUDAInitMethod init;
   auto set_init = [&init](PyObject *obj) {
     pyobj bytes(PyUnicode_AsASCIIString(obj));
-    auto iminit = init_methods.find(PyBytes_AsString(bytes.get()));
-    if (iminit == init_methods.end()) {
+    auto iminit = kmcuda::init_methods.find(PyBytes_AsString(bytes.get()));
+    if (iminit == kmcuda::init_methods.end()) {
       PyErr_SetString(
           PyExc_ValueError,
           "Unknown centroids initialization method. Supported values are "
diff --git a/src/r.cc b/src/r.cc
new file mode 100644
index 0000000..a8155b8
--- /dev/null
+++ b/src/r.cc
@@ -0,0 +1,419 @@
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+#include <R.h>
+#include <Rinternals.h>
+#include <R_ext/Rdynload.h>
+#include "kmcuda.h"
+
+namespace {
+  std::unordered_map<std::string, SEXP> parse_args(
+      const std::unordered_set<std::string> &allowed,
+      std::initializer_list<std::string> required, SEXP args) {
+    std::unordered_map<std::string, SEXP> result;
+    args = CDR(args);
+    bool pure = true;
+    for (unsigned i = 0; args != R_NilValue; i++, args = CDR(args)) {
+      SEXP arg = CAR(args);
+      if (isNull(TAG(args))) {
+        if (pure && i == result.size()) {
+          result.emplace(required.begin()[i], arg);
+        } else {
+          error("positional argument follows keyword argument");
+        }
+      } else {
+        pure = false;
+        const char *name = CHAR(PRINTNAME(TAG(args)));
+        if (allowed.find(name) == allowed.end()) {
+          error("got an unexpected keyword argument \"%s\"", name);
+        }
+        result.emplace(name, arg);
+      }
+    }
+    return result;
+  }
+
+  template<typename R>
+  R parse_dict(const std::unordered_map<std::string, R>& dict, const char *arg,
+               SEXP name) {
+    if (!isString(name)) {
+      error("\"%s\" name must be a string", arg);
+    }
+    const char *init_name = CHAR(asChar(name));
+    auto init_iter = dict.find(init_name);
+    if (init_iter == dict.end()) {
+      error("\"%s\" = \"%s\" is not supported", arg, init_name);
+    }
+    return init_iter->second;
+  }
+
+  int parse_int(SEXP value) {
+    if (isInteger(value)) {
+      return INTEGER(value)[0];
+    }
+    return REAL(value)[0];
+  }
+
+  int parse_int(const std::unordered_map<std::string, SEXP> &kwargs,
+                const std::string &name, int def) {
+    auto iter = kwargs.find(name);
+    if (iter == kwargs.end()) {
+      return def;
+    }
+    if (!isNumeric(iter->second)) {
+      error("\"%s\" must be an integer", name.c_str());
+    }
+    return parse_int(iter->second);
+  }
+
+  void parse_samples(
+      const std::unordered_map<std::string, SEXP> &kwargs,
+      std::unique_ptr<float[]> *samples, uint32_t *samples_size,
+      uint16_t *features_size) {
+    std::unique_ptr<SEXP[]> samples_chunks;
+    int chunks_size = 0;
+    {
+      auto samples_iter = kwargs.find("samples");
+      if (samples_iter == kwargs.end()) {
+        error("\"samples\" must be defined");
+      }
+      SEXP samples_obj = samples_iter->second;
+      if (isReal(samples_obj)) {
+        chunks_size = 1;
+        samples_chunks.reset(new SEXP[1]);
+        samples_chunks[0] = samples_obj;
+      } else if (TYPEOF(samples_obj) == VECSXP) {
+        chunks_size = length(samples_obj);
+        samples_chunks.reset(new SEXP[chunks_size]);
+        for (int i = 0; i < chunks_size; i++) {
+          samples_chunks[i] = VECTOR_ELT(samples_obj, i);
+        }
+      } else {
+        error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices");
+      }
+    }
+    *samples_size = 0;
+    for (int i = 0; i < chunks_size; i++) {
+      if (!isReal(samples_chunks[i])) {
+        error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices");
+      }
+      SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol);
+      if (length(dims) != 2) {
+        error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices");
+      }
+      int samples_size_i = INTEGER(dims)[0];
+      if (static_cast<int64_t>(*samples_size) + samples_size_i > INT32_MAX) {
+        error("too many samples (>INT32_MAX)");
+      }
+      *samples_size += samples_size_i;
+      int features_size_i = INTEGER(dims)[1];
+      if (features_size_i > UINT16_MAX) {
+        error("too many features (>UINT16_MAX)");
+      }
+      if (i == 0) {
+        *features_size = features_size_i;
+      } else if (*features_size != features_size_i) {
+        error("\"samples\" vector contains matrices with different number of columns");
+      }
+    }
+    samples->reset(new float[
+        static_cast<uint64_t>(*samples_size) * *features_size]);
+    float *samples_float = samples->get();
+    {
+      int offset = 0;
+      for (int i = 0; i < chunks_size; i++) {
+        double *samples_double = REAL(samples_chunks[i]);
+        SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol);
+        uint32_t fsize = *features_size;
+        uint32_t ssize = INTEGER(dims)[0];
+        #pragma omp parallel for
+        for (uint64_t f = 0; f < fsize; f++) {
+          for (uint64_t s = 0; s < ssize; s++) {
+            samples_float[offset + s * fsize + f] = samples_double[f * ssize + s];
+          }
+        }
+        offset += ssize * fsize;
+      }
+    }
+  }
+
+  KMCUDADistanceMetric parse_metric(
+      const std::unordered_map<std::string, SEXP> &kwargs) {
+    KMCUDADistanceMetric metric = kmcudaDistanceMetricL2;
+    auto metric_iter = kwargs.find("metric");
+    if (metric_iter != kwargs.end()) {
+      metric = parse_dict(kmcuda::metrics, "metric", metric_iter->second);
+    }
+    return metric;
+  }
+
+  int parse_device(
+      const std::unordered_map<std::string, SEXP> &kwargs) {
+    int device = parse_int(kwargs, "device", 0);
+    if (device < 0) {
+      error("\"device\" may not be negative");
+    }
+    return device;
+  }
+
+  static const std::unordered_set<std::string> kmeans_kwargs {
+      "samples", "clusters", "tolerance", "init", "yinyang_t", "metric",
+      "average_distance", "seed", "device", "verbosity"
+  };
+
+  static const std::unordered_set<std::string> knn_kwargs {
+      "k", "samples", "centroids", "assignments", "metric", "device",
+      "verbosity"
+  };
+}
+
+extern "C" {
+
+static SEXP r_kmeans_cuda(SEXP args);
+static SEXP r_knn_cuda(SEXP args);
+
+static R_ExternalMethodDef externalMethods[] = {
+   {"kmeans_cuda", (DL_FUNC) &r_kmeans_cuda, -1},
+   {"knn_cuda", (DL_FUNC) &r_knn_cuda, -1},
+   {NULL, NULL, 0}
+};
+
+void R_init_libKMCUDA(DllInfo *info) {
+  R_registerRoutines(info, NULL, NULL, NULL, externalMethods);
+}
+
+static SEXP r_kmeans_cuda(SEXP args) {
+  auto kwargs = parse_args(kmeans_kwargs, {"samples", "clusters"}, args);
+  std::unique_ptr<float[]> samples;
+  uint32_t samples_size;
+  uint16_t features_size;
+  parse_samples(kwargs, &samples, &samples_size, &features_size);
+  SEXP clusters_obj = kwargs["clusters"];
+  if (!isNumeric(clusters_obj)) {
+    error("\"clusters\" must be a positive integer");
+  }
+  int clusters_size = parse_int(clusters_obj);
+  if (clusters_size <= 0) {
+    error("\"clusters\" must be a positive integer");
+  }
+  if (static_cast<uint64_t>(clusters_size) * features_size > INT32_MAX
+      || static_cast<uint32_t>(clusters_size) >= samples_size) {
+    error("\"clusters\" is too big");
+  }
+  auto centroids = std::unique_ptr<float[]>(
+      new float[clusters_size * features_size]);
+  KMCUDAInitMethod init = kmcudaInitMethodPlusPlus;
+  int afkmc2_m = 0;
+  auto init_iter = kwargs.find("init");
+  if (init_iter != kwargs.end()) {
+    if (isString(init_iter->second)) {
+      init = parse_dict(kmcuda::init_methods, "init", init_iter->second);
+    } else if (TYPEOF(init_iter->second) == VECSXP) {
+      if (length(init_iter->second) == 0) {
+        error("\"init\" may not be an empty list");
+      }
+      init = parse_dict(kmcuda::init_methods, "init", CAR(init_iter->second));
+      if (init == kmcudaInitMethodAFKMC2 && length(init_iter->second) > 1) {
+        SEXP afkmc2_m_obj = CAAR(init_iter->second);
+        if (!isNumeric(afkmc2_m_obj)) {
+          error("\"init\" = %s: parameter must be a positive integer",
+                CHAR(asChar(CAR(init_iter->second))));
+        }
+        afkmc2_m = parse_int(afkmc2_m_obj);
+        if (afkmc2_m <= 0) {
+          error("\"init\" = %s: parameter must be a positive integer",
+                CHAR(asChar(CAR(init_iter->second))));
+        }
+      } else if (length(init_iter->second) != 1) {
+        error("\"init\" has wrong number of parameters");
+      }
+    } else if (isReal(init_iter->second)) {
+      init = kmcudaInitMethodImport;
+      SEXP dims = getAttrib(init_iter->second, R_DimSymbol);
+      if (length(dims) != 2
+          || INTEGER(dims)[0] != clusters_size
+          || INTEGER(dims)[1] != features_size) {
+        error("invalid centroids dimensions in \"init\"");
+      }
+      double *centroids_double = REAL(init_iter->second);
+      #pragma omp parallel for
+      for (uint64_t f = 0; f < features_size; f++) {
+        for (int64_t c = 0; c < clusters_size; c++) {
+          centroids[c * features_size + f] = centroids_double[f * clusters_size + c];
+        }
+      }
+    } else {
+      error("\"init\" must be either a string or a list or a 2D matrix");
+    }
+  }
+  float tolerance = 0.01;
+  auto tolerance_iter = kwargs.find("tolerance");
+  if (tolerance_iter != kwargs.end()) {
+    if (!isReal(tolerance_iter->second)) {
+      error("\"tolerance\" must be a real value");
+    }
+    tolerance = REAL(tolerance_iter->second)[0];
+    if (tolerance < 0 || tolerance > 1) {
+      error("\"tolerance\" must be in [0, 1]");
+    }
+  }
+  float yinyang_t = 0.1;
+  auto yinyang_t_iter = kwargs.find("yinyang_t");
+  if (yinyang_t_iter != kwargs.end()) {
+    if (!isReal(yinyang_t_iter->second)) {
+      error("\"yinyang_t\" must be a real value");
+    }
+    yinyang_t = REAL(yinyang_t_iter->second)[0];
+    if (yinyang_t < 0 || yinyang_t > 0.5) {
+      error("\"tolerance\" must be in [0, 0.5]");
+    }
+  }
+  KMCUDADistanceMetric metric = parse_metric(kwargs);
+  uint32_t seed = parse_int(kwargs, "seed", time(NULL));
+  int device = parse_device(kwargs);
+  int verbosity = parse_int(kwargs, "verbosity", 0);
+  float average_distance, *average_distance_ptr = nullptr;
+  auto average_distance_iter = kwargs.find("average_distance");
+  if (average_distance_iter != kwargs.end()) {
+    if (LOGICAL(average_distance_iter->second)[0]) {
+      average_distance_ptr = &average_distance;
+    }
+  }
+  auto assignments = std::unique_ptr<uint32_t[]>(new uint32_t[samples_size]);
+  auto result = kmeans_cuda(
+    init, &afkmc2_m, tolerance, yinyang_t, metric, samples_size, features_size,
+    clusters_size, seed, device, -1, 0, verbosity, samples.get(),
+    centroids.get(), assignments.get(), average_distance_ptr);
+  if (result != kmcudaSuccess) {
+    error("kmeans_cuda error %d %s%s", result,
+          kmcuda::statuses.find(result)->second, (verbosity > 0)? "" : "; "
+            "\"verbosity\" = 2 would reveal the details");
+  }
+  SEXP centroids2 = PROTECT(allocMatrix(REALSXP, clusters_size, features_size));
+  double *centroids_double = REAL(centroids2);
+  float *centroids_float = centroids.get();
+  #pragma omp parallel for
+  for (uint64_t f = 0; f < features_size; f++) {
+    for (int64_t c = 0; c < clusters_size; c++) {
+      centroids_double[f * clusters_size + c] = centroids_float[c * features_size + f];
+    }
+  }
+  SEXP assignments2 = PROTECT(allocVector(INTSXP, samples_size));
+  uint32_t *assignments_ptr = assignments.get();
+  int *assignments2_ptr = INTEGER(assignments2);
+  #ifndef __APPLE__
+  #pragma omp parallel for simd
+  for (uint32_t i = 0; i < samples_size; i++) {
+    assignments2_ptr[i] = assignments_ptr[i] + 1;
+  }
+  #else
+  #pragma omp simd
+  for (uint32_t i = 0; i < samples_size; i++) {
+    assignments2_ptr[i] = assignments_ptr[i] + 1;
+  }
+  #endif
+  SEXP tuple = PROTECT(allocVector(VECSXP, 2 + (average_distance_ptr != nullptr)));
+  SET_VECTOR_ELT(tuple, 0, centroids2);
+  SET_VECTOR_ELT(tuple, 1, assignments2);
+  SEXP names = PROTECT(allocVector(
+      STRSXP, 2 + (average_distance_ptr != nullptr)));
+  SET_STRING_ELT(names, 0, mkChar("centroids"));
+  SET_STRING_ELT(names, 1, mkChar("assignments"));
+  if (average_distance_ptr != nullptr) {
+    SEXP average_distance2 = PROTECT(allocVector(REALSXP, 1));
+    REAL(average_distance2)[0] = average_distance;
+    SET_VECTOR_ELT(tuple, 2, average_distance2);
+    SET_STRING_ELT(names, 2, mkChar("average_distance"));
+  }
+  setAttrib(tuple, R_NamesSymbol, names);
+  UNPROTECT(4 + (average_distance_ptr != nullptr));
+  return tuple;
+}
+
+static SEXP r_knn_cuda(SEXP args) {
+  auto kwargs = parse_args(
+      knn_kwargs, {"k", "samples", "centroids", "assignments"}, args);
+  int k = parse_int(kwargs, "k", 0);
+  if (k <= 0) {
+    error("\"k\" must be positive");
+  }
+  std::unique_ptr<float[]> samples;
+  uint32_t samples_size;
+  uint16_t features_size;
+  parse_samples(kwargs, &samples, &samples_size, &features_size);
+  if (static_cast<uint64_t>(samples_size) * k > INT32_MAX) {
+    error("too big \"k\": dim(samples)[0] * k > INT32_MAX");
+  }
+  auto centroids_iter = kwargs.find("centroids");
+  if (centroids_iter == kwargs.end()) {
+    error("\"centroids\" must be specified");
+  }
+  if (!isReal(centroids_iter->second)) {
+    error("\"centroids\" must be a 2D real matrix");
+  }
+  SEXP dims = getAttrib(centroids_iter->second, R_DimSymbol);
+  if (length(dims) != 2 || INTEGER(dims)[1] != features_size) {
+    error("invalid \"centroids\"'s dimensions");
+  }
+  int clusters_size = INTEGER(dims)[0];
+  std::unique_ptr<float[]> centroids(new float[clusters_size * features_size]);
+  double *centroids_double = REAL(centroids_iter->second);
+  float *centroids_float = centroids.get();
+  #pragma omp parallel for
+  for (uint64_t f = 0; f < features_size; f++) {
+    for (int64_t c = 0; c < clusters_size; c++) {
+      centroids_float[c * features_size + f] = centroids_double[f * clusters_size + c];
+    }
+  }
+  auto assignments_iter = kwargs.find("assignments");
+  if (assignments_iter == kwargs.end()) {
+    error("\"assignments\" must be specified");
+  }
+  if (!isInteger(assignments_iter->second)) {
+    error("\"assignments\" must be an integer vector");
+  }
+  if (static_cast<uint32_t>(length(assignments_iter->second)) != samples_size) {
+    error("invalid \"assignments\"'s length");
+  }
+  std::unique_ptr<uint32_t[]> assignments(new uint32_t[samples_size]);
+  int *assignments_obj_ptr = INTEGER(assignments_iter->second);
+  uint32_t *assignments_ptr = assignments.get();
+  #ifndef __APPLE__
+  #pragma omp parallel for simd
+  for (uint32_t i = 0; i < samples_size; i++) {
+    assignments_ptr[i] = assignments_obj_ptr[i] - 1;
+  }
+  #else
+  #pragma omp simd
+  for (uint32_t i = 0; i < samples_size; i++) {
+    assignments_ptr[i] = assignments_obj_ptr[i] - 1;
+  }
+  #endif
+  KMCUDADistanceMetric metric = parse_metric(kwargs);
+  int device = parse_device(kwargs);
+  int verbosity = parse_int(kwargs, "verbosity", 0);
+  std::unique_ptr<uint32_t[]> neighbors(new uint32_t[samples_size * k]);
+  auto result = knn_cuda(
+      k, metric, samples_size, features_size, clusters_size, device, -1, 0,
+      verbosity, samples.get(), centroids.get(), assignments_ptr, neighbors.get());
+  if (result != kmcudaSuccess) {
+    error("knn_cuda error %d %s%s", result,
+          kmcuda::statuses.find(result)->second, (verbosity > 0)? "" : "; "
+            "\"verbosity\" = 2 would reveal the details");
+  }
+  SEXP neighbors_obj = PROTECT(allocMatrix(INTSXP, samples_size, k));
+  const uint32_t *neighbors_ptr = neighbors.get();
+  int *neighbors_obj_ptr = INTEGER(neighbors_obj);
+  #pragma omp parallel for
+  for (int i = 0; i < k; i++) {
+    for (uint32_t s = 0; s < samples_size; s++) {
+      neighbors_obj_ptr[i * samples_size + s] = neighbors_ptr[s * k + i] + 1;
+    }
+  }
+  UNPROTECT(1);
+  return neighbors_obj;
+}
+
+}  // extern "C"
diff --git a/src/test.R b/src/test.R
new file mode 100644
index 0000000..92ecc36
--- /dev/null
+++ b/src/test.R
@@ -0,0 +1,105 @@
+library(testthat)
+
+if (exists("testing")) {
+  setwd(cwd)
+  dyn.load("libKMCUDA.so")
+
+  context("K-means")
+  test_that("Random",{
+    set.seed(42)
+    samples <- replicate(4, runif(16000))
+    result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
+                       init="random", seed=777, verbosity=2)
+    kmcuda_asses = replicate(1, result$assignments)
+    attach(kmeans(samples, result$centroids, iter.max=1))
+    reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster)
+    print(sprintf("Reassignments: %f", reasses))
+    expect_lt(reasses, 0.01)
+  })
+  test_that("SingleDeviceKmeans++Lloyd",{
+    set.seed(42)
+    samples <- replicate(4, runif(16000))
+    result = .External("kmeans_cuda", samples, 50, yinyang_t=0, device=1,
+                       init="kmeans++", seed=777, verbosity=2)
+    kmcuda_asses = replicate(1, result$assignments)
+    attach(kmeans(samples, result$centroids, iter.max=1))
+    reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster)
+    print(sprintf("Reassignments: %f", reasses))
+    expect_lt(reasses, 0.01)
+  })
+  test_that("MultiSamples",{
+    set.seed(42)
+    samples1 <- replicate(4, runif(16000))
+    samples2 <- replicate(4, runif(16000))
+    result = .External("kmeans_cuda", list(samples1, samples2), 50,
+                       init="kmeans++", seed=777, verbosity=2)
+    kmcuda_asses = replicate(1, result$assignments)
+    expect_equal(length(kmcuda_asses), 32000)
+    attach(kmeans(rbind(samples1, samples2), result$centroids, iter.max=1))
+    reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster)
+    print(sprintf("Reassignments: %f", reasses))
+    expect_lt(reasses, 0.01)
+  })
+  test_that("AFK-MC2",{
+    set.seed(42)
+    samples <- replicate(4, runif(16000))
+    result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
+                       init=c("afkmc2", 100), seed=777, verbosity=2)
+    kmcuda_asses = replicate(1, result$assignments)
+    attach(kmeans(samples, result$centroids, iter.max=1))
+    reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster)
+    print(sprintf("Reassignments: %f", reasses))
+    expect_lt(reasses, 0.01)
+  })
+  test_that("ImportCentroids",{
+    set.seed(42)
+    samples <- replicate(4, runif(16000))
+    centroids <- replicate(4, runif(50))
+    result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
+                       init=centroids, seed=777, verbosity=2)
+    kmcuda_asses = replicate(1, result$assignments)
+    attach(kmeans(samples, result$centroids, iter.max=1))
+    reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster)
+    print(sprintf("Reassignments: %f", reasses))
+    expect_lt(reasses, 0.01)
+  })
+  test_that("RandomPlusAverageDistance",{
+    set.seed(42)
+    samples <- replicate(4, runif(16000))
+    result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
+                       init="random", seed=777, verbosity=2,
+                       average_distance=TRUE)
+    print(result$average_distance)
+    expect_equal(result$average_distance, 0.2124216, tolerance=0.0000001);
+  })
+
+  context("K-nn")
+  test_that("Cosine",{
+    set.seed(42)
+    samples <- replicate(4, runif(16000))
+    samples <- samples / sqrt(rowSums(samples^2))
+    cls = .External("kmeans_cuda", samples, 50, tolerance=0.01, metric="cos",
+                    seed=777, verbosity=2, yinyang_t=0)
+    lapply(rowSums(cls$centroids^2), function(r) expect_equal(r, 1, 0.0001))
+    result = .External("knn_cuda", 20, samples, cls$centroids,
+                       cls$assignments, metric="cos", verbosity=2)
+    # the result is properly validated in test.py
+    expect_equal(dim(result), c(16000, 20))
+    expect_equal(class(result), "matrix")
+    expect_equal(sum(apply(result, 1, function(r) length(unique(r)))), 16000 * 20)
+  })
+} else {
+  testing <- TRUE
+  cwd <- getwd()
+  thisFile <- function() {
+    cmdArgs <- commandArgs(trailingOnly=FALSE)
+    needle <- "--file="
+    match <- grep(needle, cmdArgs)
+    if (length(match) > 0) {
+      return(normalizePath(sub(needle, "", cmdArgs[match])))
+    } else {
+      return(normalizePath(sys.frames()[[1]]$ofile))
+    }
+  }
+  test_results <- test_dir(dirname(thisFile()), reporter="summary")
+}
diff --git a/src/test.py b/src/test.py
index e9507ad..b05ff42 100644
--- a/src/test.py
+++ b/src/test.py
@@ -447,6 +447,15 @@ def test_cosine_metric(self):
         self.assertEqual(numpy.min(assignments), 0)
         self.assertEqual(numpy.max(assignments), 3)
 
+    def test_cosine_metric2(self):
+        samples = numpy.random.random((16000, 4)).astype(numpy.float32)
+        samples /= numpy.linalg.norm(samples, axis=1)[:, numpy.newaxis]
+        centroids, assignments = kmeans_cuda(
+            samples, 50, metric="cos", verbosity=2, seed=3)
+        for c in centroids:
+            norm = numpy.linalg.norm(c)
+            self.assertTrue(0.9999 < norm < 1.0001)
+
     def test_256_features(self):
         arr = numpy.random.rand(1000, 256).astype(numpy.float32)
         arr /= numpy.linalg.norm(arr, axis=1)[:, None]
diff --git a/src/transpose.cu b/src/transpose.cu
index f74ea90..af1edab 100644
--- a/src/transpose.cu
+++ b/src/transpose.cu
@@ -69,6 +69,17 @@ KMCUDAResult cuda_copy_sample_t(
   return kmcudaSuccess;
 }
 
+KMCUDAResult cuda_extract_sample_t(
+    uint32_t index, uint32_t samples_size, uint16_t features_size,
+    int verbosity, const float *samples, float *dest) {
+  dim3 block(min(1024, features_size), 1, 1);
+  dim3 grid(upper(static_cast<unsigned>(features_size), block.x), 1, 1);
+  copy_sample_t<<<grid, block>>>(
+      index, samples_size, features_size, samples, dest);
+  CUCH(cudaDeviceSynchronize(), kmcudaRuntimeError);
+  return kmcudaSuccess;
+}
+
 KMCUDAResult cuda_transpose(
     uint32_t samples_size, uint16_t features_size, bool forward,
     const std::vector<int> &devs, int verbosity, udevptrs<float> *samples) {