Skip to content

Commit

Permalink
Modified subsetting operator and adds find_medoids() function.
Browse files Browse the repository at this point in the history
  • Loading branch information
astamm committed Jan 19, 2024
1 parent 234b5ac commit d45f8ad
Show file tree
Hide file tree
Showing 12 changed files with 326 additions and 116 deletions.
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by roxygen2: do not edit by hand

S3method("[",dist)
S3method(as.matrix,subdist)
export(find_medoids)
export(use_distance)
export(use_distops)
importFrom(Rcpp,sourceCpp)
Expand Down
11 changes: 7 additions & 4 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@ for package developers to define TBB-parallelized functions to compute pairwise
distance matrices using a custom C++-implemented distance function.
* Provides subset operator
[`[.dist`](https://lmjl-alea.github.io/distops/reference/sub-.dist.html) to
subset a distance matrix. The returned object is an object of class `subdist`
which can be turned into a matrix by
[`as.matrix.subdist()`](https://lmjl-alea.github.io/distops/reference/as.matrix_subdist.html).
subset a distance matrix. The returned object is either of class `dist` if both
row and column indices are identical or of class `matrix` if both row and column
indices are different.
* Provides function `find_medoids(D, memberships = NULL)` to find medoid(s) of a
distance matrix. If `memberships` is provided, one medoid per cluster is
returned. Otherwise, a single overall medoid is returned.

## ToDo List

* Pass a list instead of a matrix to be more general.
* Pass a list instead of a matrix to be more general?
* Use Arrow parquet format to store distance matrix in multiple files when
sample size exceeds 10,000 or something like that.
* Use Arrow connection to read in large data.
Expand Down
8 changes: 8 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

GetMedoid <- function(distanceValues) {
.Call(`_distops_GetMedoid`, distanceValues)
}

GetMedoids <- function(distanceValues, groupingValues) {
.Call(`_distops_GetMedoids`, distanceValues, groupingValues)
}

DiagonalSubsetter <- function(x, indices) {
.Call(`_distops_DiagonalSubsetter`, x, indices)
}
Expand Down
30 changes: 30 additions & 0 deletions R/medoids.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#' Finds the medoids from a distance matrix
#'
#' @description This function finds the medoids from a distance matrix. The
#' medoid is the object that minimizes the sum of distances to all other
#' objects. This function takes advantage of the **RcppParallel** package to
#' compute the medoids in parallel.
#'
#' @param D An object of class [`stats::dist`].
#' @param memberships A factor specifying the cluster memberships of the
#' objects.
#'
#' @return A named integer vector specifying the indices of the medoids.
#' @export
#'
#' @examples
#' D <- stats::dist(iris[, 1:4])
#' find_medoids(D)
#' memberships <- as.factor(rep(1:3, each = 50L))
#' find_medoids(D, memberships)
find_medoids <- function(D, memberships = NULL) {
if (!inherits(D, "dist"))
cli::cli_abort("The input argument {.arg D} must be of class {.cls dist}.")
if (is.null(memberships))
return(GetMedoid(D))
if (!is.factor(memberships))
cli::cli_abort("The input argument {.arg memberships} must be of class {.cls factor}.")
out <- GetMedoids(D, as.numeric(memberships))
names(out) <- levels(memberships)
out
}
83 changes: 28 additions & 55 deletions R/subset.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@
j <- as.integer(j)
}

if (is.logical(i)) {
i <- which(i)
}

if (is.logical(j)) {
j <- which(j)
}

if (is.null(i)) {
i <- seq_len(N)
}
Expand All @@ -71,69 +79,34 @@
}

if (any(i < 1L) || any(i > N)) {
cli::cli_abort("The row indices must be all between 1 and {.arg N}.")
cli::cli_abort("The row indices must be all between 1 and {.arg {N}}.")
}

if (any(j < 1L) || any(j > N)) {
cli::cli_abort("The column indices must be all between 1 and {.arg N}.")
cli::cli_abort("The column indices must be all between 1 and {.arg {N}}.")
}

common_indices <- intersect(i, j)
i_diff <- setdiff(i, common_indices)
j_diff <- setdiff(j, common_indices)

nc <- length(common_indices)
D <- numeric(length = nc * (nc - 1) / 2)
class(D) <- "dist"
if (nc > 0) {
D <- DiagonalSubsetter(x, common_indices)
attributes(D) <- attributes(x)
attr(D, "Labels") <- row_ids[common_indices]
attr(D, "Size") <- length(common_indices)
}

R <- matrix(nrow = length(i_diff), ncol = length(j))
if (length(i_diff) > 0) {
R <- OffDiagonalSubsetter(x, i_diff, j)
rownames(R) <- row_ids[i_diff]
if (length(common_indices) == 0) {
out <- OffDiagonalSubsetter(x, i, j)
rownames(out) <- row_ids[i]
colnames(out) <- row_ids[j]
return(out)
}
colnames(R) <- row_ids[j]

C <- matrix(nrow = length(i), ncol = length(j_diff))
if (length(j_diff) > 0) {
C <- OffDiagonalSubsetter(x, i, j_diff)
colnames(C) <- row_ids[j_diff]
}
rownames(C) <- row_ids[i]

out <- list(D = D, R = R, C = C)
class(out) <- "subdist"
out
}
i_diff <- setdiff(i, common_indices)
j_diff <- setdiff(j, common_indices)

if (length(i_diff) > 0 || length(j_diff) > 0)
cli::cli_abort(c(
"The subset opertor only works if the row and column indices are either",
"all the same or all different."
))

#' Coerce a `subdist` object to a matrix
#'
#' @description Coerce a `subdist` object to a matrix.
#'
#' @param x An object of class `subdist` as generated by [`[.dist`].
#' @param ... Additional arguments passed to [`as.matrix`]. Currently ignored.
#'
#' @return A numeric matrix storing the pairwise distances between the requested
#' observations.
#' @export
#'
#' @examples
#' D <- stats::dist(iris[, 1:4])
#' x <- D[1:3, 2:7]
#' as.matrix(x)
as.matrix.subdist <- function(x, ...) {
out <- matrix(0, nrow = nrow(x$C), ncol = ncol(x$R))
rownames(out) <- rownames(x$C)
colnames(out) <- colnames(x$R)
D <- as.matrix(x$D)
out[rownames(out) %in% rownames(D), colnames(out) %in% colnames(D)] <- D
out[rownames(out) %in% rownames(x$C), !(colnames(out) %in% colnames(D))] <- x$C
out[!(rownames(out) %in% rownames(D)), colnames(out) %in% colnames(x$R)] <- x$R
out
D <- DiagonalSubsetter(x, common_indices)
attributes(D) <- attributes(x)
attr(D, "Labels") <- row_ids[common_indices]
attr(D, "Size") <- length(common_indices)
class(D) <- "dist"
D
}
25 changes: 0 additions & 25 deletions man/as.matrix.subdist.Rd

This file was deleted.

29 changes: 29 additions & 0 deletions man/find_medoids.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,29 @@ Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif

// GetMedoid
unsigned int GetMedoid(const Rcpp::NumericVector& distanceValues);
RcppExport SEXP _distops_GetMedoid(SEXP distanceValuesSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type distanceValues(distanceValuesSEXP);
rcpp_result_gen = Rcpp::wrap(GetMedoid(distanceValues));
return rcpp_result_gen;
END_RCPP
}
// GetMedoids
Rcpp::IntegerVector GetMedoids(const Rcpp::NumericVector& distanceValues, const Rcpp::IntegerVector& groupingValues);
RcppExport SEXP _distops_GetMedoids(SEXP distanceValuesSEXP, SEXP groupingValuesSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type distanceValues(distanceValuesSEXP);
Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type groupingValues(groupingValuesSEXP);
rcpp_result_gen = Rcpp::wrap(GetMedoids(distanceValues, groupingValues));
return rcpp_result_gen;
END_RCPP
}
// DiagonalSubsetter
Rcpp::NumericVector DiagonalSubsetter(const Rcpp::NumericVector& x, const Rcpp::IntegerVector& indices);
RcppExport SEXP _distops_DiagonalSubsetter(SEXP xSEXP, SEXP indicesSEXP) {
Expand Down Expand Up @@ -38,6 +61,8 @@ END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_distops_GetMedoid", (DL_FUNC) &_distops_GetMedoid, 1},
{"_distops_GetMedoids", (DL_FUNC) &_distops_GetMedoids, 2},
{"_distops_DiagonalSubsetter", (DL_FUNC) &_distops_DiagonalSubsetter, 2},
{"_distops_OffDiagonalSubsetter", (DL_FUNC) &_distops_OffDiagonalSubsetter, 3},
{NULL, NULL, 0}
Expand Down
Loading

0 comments on commit d45f8ad

Please sign in to comment.