diff --git a/NAMESPACE b/NAMESPACE index 9d1ea7d..9773325 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,7 @@ # Generated by roxygen2: do not edit by hand S3method("[",dist) -S3method(as.matrix,subdist) +export(find_medoids) export(use_distance) export(use_distops) importFrom(Rcpp,sourceCpp) diff --git a/NEWS.md b/NEWS.md index f7cfa3d..03e9b82 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,13 +10,16 @@ for package developers to define TBB-parallelized functions to compute pairwise distance matrices using a custom C++-implemented distance function. * Provides subset operator [`[.dist`](https://lmjl-alea.github.io/distops/reference/sub-.dist.html) to -subset a distance matrix. The returned object is an object of class `subdist` -which can be turned into a matrix by -[`as.matrix.subdist()`](https://lmjl-alea.github.io/distops/reference/as.matrix_subdist.html). +subset a distance matrix. The returned object is either of class `dist` if both +row and column indices are identical or of class `matrix` if both row and column +indices are different. +* Provides function `find_medoids(D, memberships = NULL)` to find medoid(s) of a +distance matrix. If `memberships` is provided, one medoid per cluster is +returned. Otherwise, a single overall medoid is returned. ## ToDo List -* Pass a list instead of a matrix to be more general. +* Pass a list instead of a matrix to be more general? * Use Arrow parquet format to store distance matrix in multiple files when sample size exceeds 10,000 or something like that. * Use Arrow connection to read in large data. diff --git a/R/RcppExports.R b/R/RcppExports.R index 9750cde..136820b 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,14 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +GetMedoid <- function(distanceValues) { + .Call(`_distops_GetMedoid`, distanceValues) +} + +GetMedoids <- function(distanceValues, groupingValues) { + .Call(`_distops_GetMedoids`, distanceValues, groupingValues) +} + DiagonalSubsetter <- function(x, indices) { .Call(`_distops_DiagonalSubsetter`, x, indices) } diff --git a/R/medoids.R b/R/medoids.R new file mode 100644 index 0000000..6a1669b --- /dev/null +++ b/R/medoids.R @@ -0,0 +1,30 @@ +#' Finds the medoids from a distance matrix +#' +#' @description This function finds the medoids from a distance matrix. The +#' medoid is the object that minimizes the sum of distances to all other +#' objects. This function takes advantage of the **RcppParallel** package to +#' compute the medoids in parallel. +#' +#' @param D An object of class [`stats::dist`]. +#' @param memberships A factor specifying the cluster memberships of the +#' objects. +#' +#' @return A named integer vector specifying the indices of the medoids. +#' @export +#' +#' @examples +#' D <- stats::dist(iris[, 1:4]) +#' find_medoids(D) +#' memberships <- as.factor(rep(1:3, each = 50L)) +#' find_medoids(D, memberships) +find_medoids <- function(D, memberships = NULL) { + if (!inherits(D, "dist")) + cli::cli_abort("The input argument {.arg D} must be of class {.cls dist}.") + if (is.null(memberships)) + return(GetMedoid(D)) +if (!is.factor(memberships)) + cli::cli_abort("The input argument {.arg memberships} must be of class {.cls factor}.") + out <- GetMedoids(D, as.numeric(memberships)) + names(out) <- levels(memberships) + out +} diff --git a/R/subset.R b/R/subset.R index 15b9853..36934d6 100644 --- a/R/subset.R +++ b/R/subset.R @@ -48,6 +48,14 @@ j <- as.integer(j) } + if (is.logical(i)) { + i <- which(i) + } + + if (is.logical(j)) { + j <- which(j) + } + if (is.null(i)) { i <- seq_len(N) } @@ -71,69 +79,34 @@ } if (any(i < 1L) || any(i > N)) { - cli::cli_abort("The row indices must be all between 1 and {.arg N}.") + cli::cli_abort("The row indices must be all between 1 and {.arg {N}}.") } if (any(j < 1L) || any(j > N)) { - cli::cli_abort("The column indices must be all between 1 and {.arg N}.") + cli::cli_abort("The column indices must be all between 1 and {.arg {N}}.") } common_indices <- intersect(i, j) - i_diff <- setdiff(i, common_indices) - j_diff <- setdiff(j, common_indices) - - nc <- length(common_indices) - D <- numeric(length = nc * (nc - 1) / 2) - class(D) <- "dist" - if (nc > 0) { - D <- DiagonalSubsetter(x, common_indices) - attributes(D) <- attributes(x) - attr(D, "Labels") <- row_ids[common_indices] - attr(D, "Size") <- length(common_indices) - } - - R <- matrix(nrow = length(i_diff), ncol = length(j)) - if (length(i_diff) > 0) { - R <- OffDiagonalSubsetter(x, i_diff, j) - rownames(R) <- row_ids[i_diff] + if (length(common_indices) == 0) { + out <- OffDiagonalSubsetter(x, i, j) + rownames(out) <- row_ids[i] + colnames(out) <- row_ids[j] + return(out) } - colnames(R) <- row_ids[j] - C <- matrix(nrow = length(i), ncol = length(j_diff)) - if (length(j_diff) > 0) { - C <- OffDiagonalSubsetter(x, i, j_diff) - colnames(C) <- row_ids[j_diff] - } - rownames(C) <- row_ids[i] - - out <- list(D = D, R = R, C = C) - class(out) <- "subdist" - out -} + i_diff <- setdiff(i, common_indices) + j_diff <- setdiff(j, common_indices) + if (length(i_diff) > 0 || length(j_diff) > 0) + cli::cli_abort(c( + "The subset opertor only works if the row and column indices are either", + "all the same or all different." + )) -#' Coerce a `subdist` object to a matrix -#' -#' @description Coerce a `subdist` object to a matrix. -#' -#' @param x An object of class `subdist` as generated by [`[.dist`]. -#' @param ... Additional arguments passed to [`as.matrix`]. Currently ignored. -#' -#' @return A numeric matrix storing the pairwise distances between the requested -#' observations. -#' @export -#' -#' @examples -#' D <- stats::dist(iris[, 1:4]) -#' x <- D[1:3, 2:7] -#' as.matrix(x) -as.matrix.subdist <- function(x, ...) { - out <- matrix(0, nrow = nrow(x$C), ncol = ncol(x$R)) - rownames(out) <- rownames(x$C) - colnames(out) <- colnames(x$R) - D <- as.matrix(x$D) - out[rownames(out) %in% rownames(D), colnames(out) %in% colnames(D)] <- D - out[rownames(out) %in% rownames(x$C), !(colnames(out) %in% colnames(D))] <- x$C - out[!(rownames(out) %in% rownames(D)), colnames(out) %in% colnames(x$R)] <- x$R - out + D <- DiagonalSubsetter(x, common_indices) + attributes(D) <- attributes(x) + attr(D, "Labels") <- row_ids[common_indices] + attr(D, "Size") <- length(common_indices) + class(D) <- "dist" + D } diff --git a/man/as.matrix.subdist.Rd b/man/as.matrix.subdist.Rd deleted file mode 100644 index 6db228c..0000000 --- a/man/as.matrix.subdist.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/subset.R -\name{as.matrix.subdist} -\alias{as.matrix.subdist} -\title{Coerce a \code{subdist} object to a matrix} -\usage{ -\method{as.matrix}{subdist}(x, ...) -} -\arguments{ -\item{x}{An object of class \code{subdist} as generated by [\verb{[.dist}].} - -\item{...}{Additional arguments passed to \code{\link{as.matrix}}. Currently ignored.} -} -\value{ -A numeric matrix storing the pairwise distances between the requested -observations. -} -\description{ -Coerce a \code{subdist} object to a matrix. -} -\examples{ -D <- stats::dist(iris[, 1:4]) -x <- D[1:3, 2:7] -as.matrix(x) -} diff --git a/man/find_medoids.Rd b/man/find_medoids.Rd new file mode 100644 index 0000000..44e3fc4 --- /dev/null +++ b/man/find_medoids.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/medoids.R +\name{find_medoids} +\alias{find_medoids} +\title{Finds the medoids from a distance matrix} +\usage{ +find_medoids(D, memberships = NULL) +} +\arguments{ +\item{D}{An object of class \code{\link[stats:dist]{stats::dist}}.} + +\item{memberships}{A factor specifying the cluster memberships of the +objects.} +} +\value{ +A named integer vector specifying the indices of the medoids. +} +\description{ +This function finds the medoids from a distance matrix. The +medoid is the object that minimizes the sum of distances to all other +objects. This function takes advantage of the \strong{RcppParallel} package to +compute the medoids in parallel. +} +\examples{ +D <- stats::dist(iris[, 1:4]) +find_medoids(D) +memberships <- as.factor(rep(1:3, each = 50L)) +find_medoids(D, memberships) +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index ef8adea..d5dfb13 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,6 +11,29 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// GetMedoid +unsigned int GetMedoid(const Rcpp::NumericVector& distanceValues); +RcppExport SEXP _distops_GetMedoid(SEXP distanceValuesSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type distanceValues(distanceValuesSEXP); + rcpp_result_gen = Rcpp::wrap(GetMedoid(distanceValues)); + return rcpp_result_gen; +END_RCPP +} +// GetMedoids +Rcpp::IntegerVector GetMedoids(const Rcpp::NumericVector& distanceValues, const Rcpp::IntegerVector& groupingValues); +RcppExport SEXP _distops_GetMedoids(SEXP distanceValuesSEXP, SEXP groupingValuesSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type distanceValues(distanceValuesSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type groupingValues(groupingValuesSEXP); + rcpp_result_gen = Rcpp::wrap(GetMedoids(distanceValues, groupingValues)); + return rcpp_result_gen; +END_RCPP +} // DiagonalSubsetter Rcpp::NumericVector DiagonalSubsetter(const Rcpp::NumericVector& x, const Rcpp::IntegerVector& indices); RcppExport SEXP _distops_DiagonalSubsetter(SEXP xSEXP, SEXP indicesSEXP) { @@ -38,6 +61,8 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { + {"_distops_GetMedoid", (DL_FUNC) &_distops_GetMedoid, 1}, + {"_distops_GetMedoids", (DL_FUNC) &_distops_GetMedoids, 2}, {"_distops_DiagonalSubsetter", (DL_FUNC) &_distops_DiagonalSubsetter, 2}, {"_distops_OffDiagonalSubsetter", (DL_FUNC) &_distops_OffDiagonalSubsetter, 3}, {NULL, NULL, 0} diff --git a/src/medoids.cpp b/src/medoids.cpp new file mode 100644 index 0000000..e935055 --- /dev/null +++ b/src/medoids.cpp @@ -0,0 +1,180 @@ +#include +#include +#include + +struct MedoidFinder : public RcppParallel::Worker +{ + // input distances + const RcppParallel::RVector m_DistanceValues; + + // medoid + unsigned int m_MedoidValue; + + // accumulated distance of the medoid + double m_SumOfDistances; + + // constructors + MedoidFinder(const Rcpp::NumericVector input) + : m_DistanceValues(input), + m_MedoidValue(0), + m_SumOfDistances(std::numeric_limits::infinity()) {} + MedoidFinder(const MedoidFinder& medoidFinder, RcppParallel::Split) + : m_DistanceValues(medoidFinder.m_DistanceValues), + m_MedoidValue(0), + m_SumOfDistances(std::numeric_limits::infinity()) {} + + // find medoid on just the element of the range I've been asked to + void operator()(std::size_t begin, std::size_t end) { + unsigned int numberOfDistanceValues = m_DistanceValues.size(); + unsigned int numberOfObservations = (1 + std::sqrt(1 + 8 * numberOfDistanceValues)) / 2; + + for (unsigned int observationIndex = begin; observationIndex < end; ++observationIndex) + { + double distanceSum = 0.0; + for (unsigned int otherObservationIndex = 0; otherObservationIndex < numberOfObservations; ++otherObservationIndex) + { + if (observationIndex == otherObservationIndex) + continue; + + unsigned int rowId = observationIndex + 1; + unsigned int columnId = otherObservationIndex + 1; + if (rowId > columnId) + { + unsigned int temp = rowId; + rowId = columnId; + columnId = temp; + } + unsigned int distanceIndex = numberOfObservations * (rowId - 1) - + rowId * (rowId - 1) / 2 + columnId - rowId - 1; + distanceSum += m_DistanceValues[distanceIndex]; + } + + if (distanceSum < m_SumOfDistances) + { + m_SumOfDistances = distanceSum; + m_MedoidValue = observationIndex + 1; + } + } + } + + // join my values with that of another MedoidFinder + void join(const MedoidFinder& rhs) { + if (rhs.m_SumOfDistances < m_SumOfDistances) + { + m_SumOfDistances = rhs.m_SumOfDistances; + m_MedoidValue = rhs.m_MedoidValue; + } + } +}; + +// [[Rcpp::export]] +unsigned int GetMedoid(const Rcpp::NumericVector& distanceValues) +{ + MedoidFinder medoidFinder(distanceValues); + RcppParallel::parallelReduce(0, distanceValues.attr("Size"), medoidFinder); + return medoidFinder.m_MedoidValue; +} + +struct MultipleMedoidFinder : public RcppParallel::Worker +{ + // input distances + const RcppParallel::RVector m_DistanceValues; + + // input groups + const RcppParallel::RVector m_GroupingValues; + + // input group ID + const unsigned int m_GroupId; + + // medoid + unsigned int m_MedoidValue; + + // accumulated distance of the medoid + double m_SumOfDistances; + + // constructors + MultipleMedoidFinder(const Rcpp::NumericVector& input, + const Rcpp::IntegerVector& grouping, + const unsigned int groupId) + : m_DistanceValues(input), + m_GroupingValues(grouping), + m_GroupId(groupId), + m_MedoidValue(0), + m_SumOfDistances(std::numeric_limits::infinity()) {} + MultipleMedoidFinder(const MultipleMedoidFinder& medoidFinder, RcppParallel::Split) + : m_DistanceValues(medoidFinder.m_DistanceValues), + m_GroupingValues(medoidFinder.m_GroupingValues), + m_GroupId(medoidFinder.m_GroupId), + m_MedoidValue(0), + m_SumOfDistances(std::numeric_limits::infinity()) {} + + // find medoid on just the element of the range I've been asked to + void operator()(std::size_t begin, std::size_t end) { + unsigned int numberOfObservations = m_GroupingValues.size(); + + for (unsigned int observationIndex = begin; observationIndex < end; ++observationIndex) + { + unsigned int observationGroupId = m_GroupingValues[observationIndex]; + if (observationGroupId != m_GroupId) + continue; + + double distanceSum = 0.0; + for (unsigned int otherObservationIndex = 0; otherObservationIndex < numberOfObservations; ++otherObservationIndex) + { + unsigned int otherObservationGroupId = m_GroupingValues[otherObservationIndex]; + if (otherObservationGroupId != m_GroupId) + continue; + + if (observationIndex == otherObservationIndex) + continue; + + unsigned int rowId = observationIndex + 1; + unsigned int columnId = otherObservationIndex + 1; + if (rowId > columnId) + { + unsigned int temp = rowId; + rowId = columnId; + columnId = temp; + } + unsigned int distanceIndex = numberOfObservations * (rowId - 1) - + rowId * (rowId - 1) / 2 + columnId - rowId - 1; + distanceSum += m_DistanceValues[distanceIndex]; + } + + if (distanceSum < m_SumOfDistances) + { + m_SumOfDistances = distanceSum; + m_MedoidValue = observationIndex + 1; + } + } + } + + // join my values with that of another MedoidFinder + void join(const MultipleMedoidFinder& rhs) { + if (rhs.m_SumOfDistances < m_SumOfDistances) + { + m_SumOfDistances = rhs.m_SumOfDistances; + m_MedoidValue = rhs.m_MedoidValue; + } + } +}; + +// [[Rcpp::export]] +Rcpp::IntegerVector GetMedoids(const Rcpp::NumericVector& distanceValues, + const Rcpp::IntegerVector& groupingValues) +{ + Rcpp::IntegerVector groupIds = Rcpp::sort_unique(groupingValues); + unsigned int numberOfGroups = groupIds.size(); + unsigned int numberOfObservations = distanceValues.attr("Size"); + Rcpp::IntegerVector medoids(numberOfGroups); + + for (unsigned int groupIndex = 0; groupIndex < numberOfGroups; ++groupIndex) + { + unsigned int groupId = groupIds[groupIndex]; + MultipleMedoidFinder medoidFinder(distanceValues, groupingValues, groupId); + RcppParallel::parallelReduce(0, numberOfObservations, medoidFinder); + medoids[groupIndex] = medoidFinder.m_MedoidValue; + } + + return medoids; +} diff --git a/tests/testthat/_snaps/subset.md b/tests/testthat/_snaps/subset.md deleted file mode 100644 index ed04024..0000000 --- a/tests/testthat/_snaps/subset.md +++ /dev/null @@ -1,24 +0,0 @@ -# Subset operator works - - Code - Dsub - Output - $D - 2 3 - 3 0.3000000 - 4 0.3316625 0.2449490 - - $R - 2 3 4 5 6 7 - 1 0.5385165 0.509902 0.6480741 0.1414214 0.6164414 0.5196152 - - $C - 5 6 7 - 1 0.1414214 0.6164414 0.5196152 - 2 0.6082763 1.0908712 0.5099020 - 3 0.5099020 1.0862780 0.2645751 - 4 0.6480741 1.1661904 0.3316625 - - attr(,"class") - [1] "subdist" - diff --git a/tests/testthat/test-medoids.R b/tests/testthat/test-medoids.R new file mode 100644 index 0000000..97d7572 --- /dev/null +++ b/tests/testthat/test-medoids.R @@ -0,0 +1,13 @@ +test_that("find_medoids() works", { + D <- stats::dist(iris[, 1:4]) + expect_error(find_medoids(as.matrix(D))) + out1 <- find_medoids(D) + expect_equal(out1, 62L) + memberships <- rep(1:3, each = 50L) + expect_error(find_medoids(D, memberships)) + memberships <- as.factor(memberships) + out2 <- find_medoids(D, memberships) + expected <- c(8L, 97L, 113L) + names(expected) <- c("1", "2", "3") + expect_equal(out2, expected) +}) diff --git a/tests/testthat/test-subset.R b/tests/testthat/test-subset.R index 42ce04d..34c9a3c 100644 --- a/tests/testthat/test-subset.R +++ b/tests/testthat/test-subset.R @@ -1,10 +1,8 @@ test_that("Subset operator works", { D <- stats::dist(iris[, 1:4]) - Dsub <- D[1:4, 2:7] - expect_equal(length(Dsub), 3L) - expect_equal(names(Dsub), c("D", "R", "C")) - expect_equal(attr(Dsub$D, "Size"), 3L) - expect_equal(dim(Dsub$R), c(1L, 6L)) - expect_equal(dim(Dsub$C), c(4L, 3L)) - expect_snapshot(Dsub) + expect_error(D[1:4, 2:7]) + Dsub <- D[1:3, 1:3] + expect_equal(class(Dsub), "dist") + Dsub <- D[1:3, 4:6] + expect_equal(class(Dsub), c("matrix", "array")) })