diff --git a/R/RcppExports.R b/R/RcppExports.R index 57e983d5..6349692e 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,10 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +fbm256_prod_and_rowSumsSq <- function(BM, ind_row, ind_col, center, scale, V) { + .Call(`_tidypopgen_fbm256_prod_and_rowSumsSq`, BM, ind_row, ind_col, center, scale, V) +} + gt_grouped_alt_freq_diploid <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) { .Call(`_tidypopgen_gt_grouped_alt_freq_diploid`, BM, rowInd, colInd, groupIds, ngroups, ncores) } diff --git a/R/gt_pca_autoSVD.R b/R/gt_pca_autoSVD.R index 80e642dc..b6c3eadc 100644 --- a/R/gt_pca_autoSVD.R +++ b/R/gt_pca_autoSVD.R @@ -114,6 +114,8 @@ gt_pca_autoSVD <- function(x, k = 10, this_svd$method <- "autoSVD" this_svd$call <- match.call() + # subset the loci table to have only the snps of interest + this_svd$loci <- show_loci(x)[.gt_bigsnp_cols %in% attr(x,"subset"),] class(this_svd) <- c("gt_pca", class(this_svd)) this_svd } diff --git a/R/gt_pca_partialSVD.R b/R/gt_pca_partialSVD.R index 6902403c..37b5ecd1 100644 --- a/R/gt_pca_partialSVD.R +++ b/R/gt_pca_partialSVD.R @@ -54,6 +54,7 @@ gt_pca_partialSVD <- function(x, k = 10, fun_scaling = bigsnpr::snp_scaleBinom() rownames(this_svd$v) <- loci_names(x) this_svd$method <- "partialSVD" this_svd$call <- match.call() + this_svd$loci <- show_loci(x) class(this_svd) <- c("gt_pca", class(this_svd)) this_svd } diff --git a/R/gt_pca_randomSVD.R b/R/gt_pca_randomSVD.R index 53083e77..292f56b6 100644 --- a/R/gt_pca_randomSVD.R +++ b/R/gt_pca_randomSVD.R @@ -77,6 +77,7 @@ gt_pca_randomSVD <- function(x, k = 10, rownames(this_svd$v) <- loci_names(x) this_svd$method <- "randomSVD" this_svd$call <- match.call() + this_svd$loci <- show_loci(x) class(this_svd) <- c("gt_pca", class(this_svd)) this_svd } diff --git a/R/predict_gt_pca.R b/R/predict_gt_pca.R index 985477ad..778f4d6e 100644 --- a/R/predict_gt_pca.R +++ b/R/predict_gt_pca.R @@ -1,20 +1,37 @@ #' Predict scores of a PCA #' -#' Predict the PCA scores for a [`gt_pca`], either for the original data or for new data. +#' Predict the PCA scores for a [`gt_pca`], either for the original data or +#' projecting new data. #' #' @param object the [`gt_pca`] object #' @param new_data a gen_tibble if scores are requested for a new dataset +#' @param impute_to_center boolean on whether to impute missing values in +#' `new_data` to the mean values used +#' to center the pca. This option is used to e.g. project ancient data onto a PCA +#' fitted to modern data. It defaults to TRUE. +#' @param prediction_type a string taking the value of "simple" and/or OADP (Online Augmentation, Decomposition, and Procrustes (OADP) projection) #' @param block_size number of loci read simultaneously (larger values will speed up #' computation, but require more memory) +#' @param n_cores number of cores #' @param ... no used #' @returns a matrix of predictions, with samples as rows and components as columns. The number -#' of components depends on how many were estimated in the [`gt_pca`] object +#' of components depends on how many were estimated in the [`gt_pca`] object. If prediction +#' type is equal to c("simple","OADP"), then a list of two matrices is returned +#' @references Zhang et al (2020). Fast and robust ancestry prediction using +#' principal component analysis 36(11): 3439–3446. #' @rdname predict_gt_pca #' @export # this is a modified version of bigstatsr::predict.big_SVD -predict.gt_pca <- function(object, new_data=NULL,block_size = NULL, ...){ +predict.gt_pca <- function(object, new_data=NULL, impute_to_center = TRUE, + prediction_type = "simple", + block_size = NULL, + n_cores = 1, ...){ rlang::check_dots_empty() + if (!all(prediction_type %in% c("simple", "OADP"))){ + stop("prediction_type can only take values 'simple' or 'OADP'") + } + if (is.null(new_data)) { # U * D sweep(object$u, 2, object$d, '*') @@ -22,20 +39,90 @@ predict.gt_pca <- function(object, new_data=NULL,block_size = NULL, ...){ if (!inherits(new_data,"gen_tbl")){ stop ("new_data should be a gen_tibble") } - if (gt_has_imputed(new_data) && !gt_uses_imputed(new_data)){ - gt_set_imputed(new_data, set = TRUE) - on.exit(gt_set_imputed(new_data, set = FALSE)) + # check the new_data have the same loci as the dataset used to build the pca + if (!all(object$loci$name %in% show_loci(new_data)$name)){ + stop("loci used in object are not present in new_data") } - if (is.null(block_size)){ - block_size <- bigstatsr::block_size(nrow(new_data)) + # get id of loci in new_data + loci_subset <- match(object$loci$name, show_loci(new_data)$name) + if (!all(all(show_loci(new_data)$allele_ref[loci_subset]==object$loci$allele_ref), + all(show_loci(new_data)$allele_alt[loci_subset]==object$loci$allele_alt))){ + stop("ref and alt alleles differ between new_data and the data used to create the pca object") + } + + if (!impute_to_center){ + if (gt_has_imputed(new_data) && !gt_uses_imputed(new_data)){ + gt_set_imputed(new_data, set = TRUE) + on.exit(gt_set_imputed(new_data, set = FALSE)) + } + + if (is.null(block_size)){ + block_size <- bigstatsr::block_size(nrow(new_data)) + } + # X * V + XV <- bigstatsr::big_prodMat(.gt_get_bigsnp(new_data)$genotypes, + object$v, + ind.row = .gt_bigsnp_rows(new_data), + ind.col = .gt_bigsnp_cols(new_data)[loci_subset], + block.size = block_size, + center = object$center, + scale = object$scale) + # if we use OADP, then we need to compute Xnorm + if ("OADP" %in% prediction_type){ + stop ("OADP currently only implemented for when `impute_to_center = TRUE`") + } + } else { + + X_norm <- bigstatsr::FBM(nrow(new_data), 1, init = 0) + XV <- bigstatsr::FBM(nrow(new_data), ncol(object$u), init = 0) + + bigstatsr::big_parallelize( + .gt_get_bigsnp(new_data)$genotypes, + p.FUN = fbm256_part_prod, + ind = seq_along(loci_subset), + ncores = n_cores, + ind.row = .gt_bigsnp_rows(new_data), + ind.col = .gt_bigsnp_cols(new_data)[loci_subset], #info_snp$`_NUM_ID_`[keep], + center = object$center, + scale = object$scale, + V = object$v, + XV = XV, + X_norm = X_norm + ) + + if ("OADP" %in% prediction_type){ + oadp_proj <- utils::getFromNamespace("OADP_proj", "bigsnpr")(XV, X_norm, object$d, ncores = n_cores) + } } - # X * V - bigstatsr::big_prodMat(.gt_get_bigsnp(new_data)$genotypes, - object$v, - ind.row = .gt_bigsnp_rows(new_data), - ind.col = .gt_bigsnp_cols(new_data), - block.size = block_size, - center = object$center, - scale = object$scale) + if (all(c("simple","OADP") %in% prediction_type)){ + return( list( + simple_proj = XV[, , drop = FALSE], + OADP_proj = oadp_proj + )) + } else if ("simple" %in% prediction_type) { + return(XV[,, drop = FALSE]) + } else { + return (oadp_proj) + } + return(XV[,, drop = FALSE]) } } + +####################################################################################### +# a port of bigsnpr::part_prod to work on standard fb256 matrices + +fbm256_part_prod <- function(X, ind, ind.row, ind.col, center, scale, V, XV, X_norm) { + + res <- fbm256_prod_and_rowSumsSq( + BM = X, + ind_row = ind.row, + ind_col = ind.col[ind], + center = center[ind], + scale = scale[ind], + V = V[ind, , drop = FALSE] + ) + + bigstatsr::big_increment(XV, res[[1]], use_lock = TRUE) + bigstatsr::big_increment(X_norm, res[[2]], use_lock = TRUE) +} + diff --git a/man/predict_gt_pca.Rd b/man/predict_gt_pca.Rd index 6db339d7..67220986 100644 --- a/man/predict_gt_pca.Rd +++ b/man/predict_gt_pca.Rd @@ -4,22 +4,45 @@ \alias{predict.gt_pca} \title{Predict scores of a PCA} \usage{ -\method{predict}{gt_pca}(object, new_data = NULL, block_size = NULL, ...) +\method{predict}{gt_pca}( + object, + new_data = NULL, + impute_to_center = TRUE, + prediction_type = "simple", + block_size = NULL, + n_cores = 1, + ... +) } \arguments{ \item{object}{the \code{\link{gt_pca}} object} \item{new_data}{a gen_tibble if scores are requested for a new dataset} +\item{impute_to_center}{boolean on whether to impute missing values in +\code{new_data} to the mean values used +to center the pca. This option is used to e.g. project ancient data onto a PCA +fitted to modern data. It defaults to TRUE.} + +\item{prediction_type}{a string taking the value of "simple" and/or OADP (Online Augmentation, Decomposition, and Procrustes (OADP) projection)} + \item{block_size}{number of loci read simultaneously (larger values will speed up computation, but require more memory)} +\item{n_cores}{number of cores} + \item{...}{no used} } \value{ a matrix of predictions, with samples as rows and components as columns. The number -of components depends on how many were estimated in the \code{\link{gt_pca}} object +of components depends on how many were estimated in the \code{\link{gt_pca}} object. If prediction +type is equal to c("simple","OADP"), then a list of two matrices is returned } \description{ -Predict the PCA scores for a \code{\link{gt_pca}}, either for the original data or for new data. +Predict the PCA scores for a \code{\link{gt_pca}}, either for the original data or +projecting new data. +} +\references{ +Zhang et al (2020). Fast and robust ancestry prediction using +principal component analysis 36(11): 3439–3446. } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 2c788e71..d93049e3 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,6 +11,22 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// fbm256_prod_and_rowSumsSq +List fbm256_prod_and_rowSumsSq(Environment BM, const IntegerVector& ind_row, const IntegerVector& ind_col, const NumericVector& center, const NumericVector& scale, const NumericMatrix& V); +RcppExport SEXP _tidypopgen_fbm256_prod_and_rowSumsSq(SEXP BMSEXP, SEXP ind_rowSEXP, SEXP ind_colSEXP, SEXP centerSEXP, SEXP scaleSEXP, SEXP VSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Environment >::type BM(BMSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type ind_row(ind_rowSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type ind_col(ind_colSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type center(centerSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type V(VSEXP); + rcpp_result_gen = Rcpp::wrap(fbm256_prod_and_rowSumsSq(BM, ind_row, ind_col, center, scale, V)); + return rcpp_result_gen; +END_RCPP +} // gt_grouped_alt_freq_diploid ListOf gt_grouped_alt_freq_diploid(Environment BM, const IntegerVector& rowInd, const IntegerVector& colInd, const IntegerVector& groupIds, int ngroups, int ncores); RcppExport SEXP _tidypopgen_gt_grouped_alt_freq_diploid(SEXP BMSEXP, SEXP rowIndSEXP, SEXP colIndSEXP, SEXP groupIdsSEXP, SEXP ngroupsSEXP, SEXP ncoresSEXP) { @@ -171,6 +187,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { + {"_tidypopgen_fbm256_prod_and_rowSumsSq", (DL_FUNC) &_tidypopgen_fbm256_prod_and_rowSumsSq, 6}, {"_tidypopgen_gt_grouped_alt_freq_diploid", (DL_FUNC) &_tidypopgen_gt_grouped_alt_freq_diploid, 6}, {"_tidypopgen_gt_grouped_alt_freq_pseudohap", (DL_FUNC) &_tidypopgen_gt_grouped_alt_freq_pseudohap, 7}, {"_tidypopgen_gt_grouped_missingness", (DL_FUNC) &_tidypopgen_gt_grouped_missingness, 6}, diff --git a/src/fbm_prod_and_rowSumSq.cpp b/src/fbm_prod_and_rowSumSq.cpp new file mode 100644 index 00000000..e98a9d03 --- /dev/null +++ b/src/fbm_prod_and_rowSumSq.cpp @@ -0,0 +1,48 @@ +/******************************************************************************/ + +#include + +/******************************************************************************/ + +// This is a port of prod_and_rowSumSq from bigsnpr, adapted to work on a standard FBM + +// [[Rcpp::export]] +List fbm256_prod_and_rowSumsSq(Environment BM, + const IntegerVector& ind_row, + const IntegerVector& ind_col, + const NumericVector& center, + const NumericVector& scale, + const NumericMatrix& V) { + + XPtr xpBM = BM["address"]; + SubBMCode256Acc macc(xpBM, ind_row, ind_col, BM["code256"], 1); + + + size_t n = macc.nrow(); //Number of individuals + size_t m = macc.ncol(); //Number of sites + myassert_size(m, V.rows()); //check number of sites same as number in V from PCA + size_t K = V.cols(); + size_t i, j, k; + + NumericMatrix XV(n, K); + NumericVector rowSumsSq(n); + + for (j = 0; j < m; j++) { + for (i = 0; i < n; i++) { + double x = macc(i, j); // here we need to center and standardise + if (x>-1){ + x = (x-center[j])/scale[j]; + } else { +// Rcout<<"impute"<% group_by(population) +# TODO I don't understand why this does not silence all messages +options("rlib_message_verbosity" = "quiet") + test_that("extract f2 correctly",{ # process the data with admixtools (note that we get some warnings) bed_file <- gt_as_plink(test_gt, file = tempfile("test_bed")) diff --git a/tests/testthat/test_gt_pca.R b/tests/testthat/test_gt_pca.R index cb784134..5c212d5d 100644 --- a/tests/testthat/test_gt_pca.R +++ b/tests/testthat/test_gt_pca.R @@ -5,8 +5,48 @@ test_that("fit_gt_pca_and_predict",{ "You can't have missing") missing_gt <- gt_impute_simple(missing_gt, method = "mode") missing_pca <- missing_gt %>% gt_pca_partialSVD() + # check that predicting on the object is the same as predicting from the full dataset + # without imputation to the center (the data are already imputed) expect_true(all.equal(predict(missing_pca), - predict(missing_pca, new_data = missing_gt), + predict(missing_pca, new_data = missing_gt, + impute_to_center = FALSE), check.attributes=FALSE)) + # now mismatch the loci table + missing_gt_edited <- missing_gt + show_loci(missing_gt_edited)$name[3] <- "blah" + expect_error(predict(missing_pca, new_data = missing_gt_edited), + "loci used in object") + missing_gt_edited <- missing_gt + show_loci(missing_gt_edited)$allele_ref[3] <- "blah" + expect_error(predict(missing_pca, new_data = missing_gt_edited), + "ref and alt alleles differ") + # predict when new dataset has extra positions + missing_gt_sub <- missing_gt %>% select_loci(100:450) + missing_sub_pca <- missing_gt_sub %>% gt_pca_partialSVD() + expect_true(all.equal(predict(missing_sub_pca), + predict(missing_sub_pca, new_data = missing_gt, + impute_to_center = FALSE), + check.attributes=FALSE)) + }) +# TODO we should test gt_pca_autoSVD(), as the loci have to be subset within +# the object + +test_that("fit_gt_pca_and_predict_splitted_data",{ + bed_file <- system.file("extdata", "example-missing.bed", package = "bigsnpr") + missing_gt <- gen_tibble(bed_file, backingfile = tempfile("missing_"),quiet=TRUE) + # create a fake ancient set by subsetting + ancient_gt <- missing_gt[1:20,] + # now extract the modern data (to be imputed) + modern_gt <- missing_gt[-c(1:20),] + + modern_gt <- gt_impute_simple(modern_gt, method = "mode") + modern_pca <- modern_gt %>% gt_pca_partialSVD() + # if we just try to predict, we find that the new data have missing data + ancient_pred <- predict(modern_pca, new_data = ancient_gt) + expect_true(all(dim(ancient_pred)==c(20,10))) + # now raise an error if we don't impute to the mean + expect_error(predict(modern_pca, new_data = ancient_gt, impute_to_center = FALSE), + "You can't have missing values in 'X'") +})