From 23646e10925a64d3d7361ec47085079bb546125a Mon Sep 17 00:00:00 2001 From: Sergey Kucheryavskiy Date: Thu, 8 Nov 2018 16:13:28 +0100 Subject: [PATCH 1/8] improvements to prep.autoscale(), fixes #58 Fixed a big with using max.cov and changed its default value to 0 --- R/prep.R | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/R/prep.R b/R/prep.R index d7d0996..7989d9a 100755 --- a/R/prep.R +++ b/R/prep.R @@ -10,13 +10,20 @@ #' @param scale #' a logical value or vector with numbers for weighting #' @param max.cov -#' columns that have coefficient of variation (in percent) below `max.cv` will not be scaled +#' columns that have coefficient of variation (in percent) below or equal to `max.cov` will not be scaled #' #' @return #' data matrix with processed values #' +#' @description +#' +#' The use of `max.cov` allows to avoid overestimation of inert variables, which vary +#' very little. Note, that the `max.cov` value is already in percent, e.g. if `max.cov = 0.1` it +#' will compare the coefficient of variation of every variable with 0.1% (not 1%). If you do not +#' want to use this option simply keep `max.cov = 0`. +#' #' @export -prep.autoscale = function(data, center = T, scale = F, max.cov = 0.1) { +prep.autoscale = function(data, center = T, scale = F, max.cov = 0) { attrs = mda.getattr(data) dimnames = dimnames(data) @@ -33,16 +40,18 @@ prep.autoscale = function(data, center = T, scale = F, max.cov = 0.1) { else if(is.numeric(scale)) scale = scale - # make autoscaling and attach preprocessing attributes - + # compute coefficient of variation and set scale to 1 if it is below + # a user defined threshold if(is.numeric(scale)) { if (!is.numeric(center)) m = apply(data, 2, mean) else m = center cv = scale/m * 100 - scale[is.nan(cv) | cv < 0.1] = 1 + scale[is.nan(cv) | cv <= max.cov] = 1 } + + # make autoscaling and attach preprocessing attributes data = scale(data, center = center, scale = scale) data = mda.setattr(data, attrs) @@ -51,6 +60,7 @@ prep.autoscale = function(data, center = T, scale = F, max.cov = 0.1) { attr(data, 'prep:center') = center attr(data, 'prep:scale') = scale dimnames(data) = dimnames + data } From 78d58e688e929ae351ea7b0e67702b6868eda4ed Mon Sep 17 00:00:00 2001 From: Sergey Kucheryavskiy Date: Thu, 8 Nov 2018 16:14:43 +0100 Subject: [PATCH 2/8] added @export to classres to make possible using classless with other methods outside mdatools package --- R/classres.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/classres.R b/R/classres.R index 8501969..53025e2 100755 --- a/R/classres.R +++ b/R/classres.R @@ -43,6 +43,8 @@ #' \code{\link{plotPerformance.classres}} \tab makes plot with misclassified ration, specificity #' and sensitivity values.\cr #' } +#' +#' @export classres = function(c.pred, c.ref = NULL, p.pred = NULL, ncomp.selected = NULL) { if (!is.null(c.ref)) { attrs = mda.getattr(c.ref) From 1502af88aae851575ccf5e7a9fd2c82a74a44571 Mon Sep 17 00:00:00 2001 From: Sergey Kucheryavskiy Date: Fri, 9 Nov 2018 09:44:02 +0100 Subject: [PATCH 3/8] added tests for preprocessing --- tests/testthat/test_prep.R | 45 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/testthat/test_prep.R diff --git a/tests/testthat/test_prep.R b/tests/testthat/test_prep.R new file mode 100644 index 0000000..ac3ff60 --- /dev/null +++ b/tests/testthat/test_prep.R @@ -0,0 +1,45 @@ +# new tests on top + +context('Preprocessing: autoscale') + +data(simdata) + +# normal spectra +X1 = simdata$spectra.c +p11 = prep.autoscale(X1) +p12 = prep.autoscale(X1, center = T, scale = F) +p13 = prep.autoscale(X1, center = T, scale = T) + +# spectra with constant and small variation variables +X2 = simdata$spectra.c +X2[, 20:50] = matrix(apply(X2[, 20:50], 2, mean), nrow = nrow(X2), ncol = 31, byrow = T) # constant +X2[, 51:60] = X2[, 51:60] * 0.001 + 0.1 # small CV +p21 = prep.autoscale(X2) +p22 = prep.autoscale(X2, center = T, scale = F) +p23 = prep.autoscale(X2, center = T, scale = T) +p24 = prep.autoscale(X2, center = T, scale = T, max.cov = 1) + +test_that("Default autoscaling is correct", { + expect_equal(attr(p11, 'prep:center'), apply(X1, 2, mean)) + expect_equal(attr(p11, 'prep:scale'), FALSE) + expect_equal(p11, p12) + expect_equal(p11, scale(X1, center = T, scale = F), check.attributes = FALSE) +}) + +test_that("Full autoscaling is correct", { + expect_equal(attr(p13, 'prep:center'), apply(X1, 2, mean)) + expect_equal(attr(p13, 'prep:scale'), apply(X1, 2, sd)) + expect_equal(p13, scale(X1, center = T, scale = T), check.attributes = FALSE) +}) + +test_that("Default autoscaling for data with constant variables is correct", { + expect_equal(attr(p21, 'prep:center'), apply(X2, 2, mean)) + expect_equal(attr(p21, 'prep:scale'), FALSE) + expect_equal(p21, p22) + expect_equal(p21, scale(X2, center = T, scale = F), check.attributes = FALSE) +}) + +test_that("Full autoscaling for data with constant variables is correct", { + expect_equal(attr(p23, 'prep:center'), apply(X1, 2, mean)) + expect_equal(attr(p23, 'prep:scale')[-(20:50)], apply(X1, 2, sd)[-(20:50)]) +}) From f2140ad6cb57a14a78d19dea392a6445e79927ec Mon Sep 17 00:00:00 2001 From: Sergey Kucheryavskiy Date: Fri, 9 Nov 2018 09:45:00 +0100 Subject: [PATCH 4/8] small changes --- NEWS.md | 3 +++ R/crossval.R | 4 ++-- R/mdaplots.R | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index a8bfaa4..29b30b8 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +v.0.9.2 +======= + v.0.9.1 ======= * all plot functions have new `opacity` parameter for semi-transparent colors diff --git a/R/crossval.R b/R/crossval.R index ea63efe..6a19d26 100755 --- a/R/crossval.R +++ b/R/crossval.R @@ -16,8 +16,8 @@ #' @return #' matrix with object indices for each segment #' -crossval = function(nobj, cv = NULL) -{ +#' @export +crossval = function(nobj, cv = NULL) { methods = c('rand', 'ven', 'loo') if (is.null(cv)) diff --git a/R/mdaplots.R b/R/mdaplots.R index a8598f4..d530745 100755 --- a/R/mdaplots.R +++ b/R/mdaplots.R @@ -1216,6 +1216,7 @@ mdaplot = function(data = NULL, plot.data = NULL, type = 'p', pch = 16, col = NU # show colorbar if needed if (!is.null(cgroup) && show.colorbar == T) mdaplot.showColorbar(cgroup, colmap, lab.col = lab.col, lab.cex = lab.cex) + } #' Plotting function for several sets of objects From 58d2cad4a3f386d8bd23cf212a8eb2de41a9a135 Mon Sep 17 00:00:00 2001 From: Sergey Kucheryavskiy Date: Fri, 9 Nov 2018 12:30:37 +0100 Subject: [PATCH 5/8] improvements to selectCompNum.pls() method, closes #57 --- R/defaults.R | 4 +- R/pls.R | 143 ++++++++++++++++++++++++++------------------------- 2 files changed, 77 insertions(+), 70 deletions(-) diff --git a/R/defaults.R b/R/defaults.R index af3fa1d..736f96b 100755 --- a/R/defaults.R +++ b/R/defaults.R @@ -159,9 +159,11 @@ getSelectivityRatio = function(obj, ...) { #' a model object #' @param ncomp #' number of components to select +#' @param ... +#' other arguments #' #' @export -selectCompNum = function(model, ncomp) { +selectCompNum = function(model, ncomp = NULL, ...) { UseMethod("selectCompNum") } diff --git a/R/pls.R b/R/pls.R index 82fd94e..46220cd 100755 --- a/R/pls.R +++ b/R/pls.R @@ -50,11 +50,14 @@ #' \item{weights }{matrix with PLS weights.} #' \item{selratio }{array with selectivity ratio values.} #' \item{vipscores }{matrix with VIP scores values.} -#' \item{coeffs }{object of class \code{\link{regcoeffs}} with regression coefficients calculated for each component.} +#' \item{coeffs }{object of class \code{\link{regcoeffs}} with regression coefficients calculated +#' for each component.} #' \item{info }{information about the model, provided by user when build the model.} #' \item{calres }{an object of class \code{\link{plsres}} with PLS results for a calibration data.} -#' \item{testres }{an object of class \code{\link{plsres}} with PLS results for a test data, if it was provided.} -#' \item{cvres }{an object of class \code{\link{plsres}} with PLS results for cross-validation, if this option was chosen.} +#' \item{testres }{an object of class \code{\link{plsres}} with PLS results for a test data, if it +#' was provided.} +#' \item{cvres }{an object of class \code{\link{plsres}} with PLS results for cross-validation, if +#' this option was chosen.} #' #' @details #' So far only SIMPLS method [1] is available, more coming soon. Implementation works both with one @@ -235,9 +238,10 @@ pls = function(x, y, ncomp = 15, center = T, scale = F, cv = NULL, exclcols = NU ncomp.selcrit = 'min') { # build a model and apply to calibration set - model = pls.cal(x, y, ncomp, center = center, scale = scale, method = method, coeffs.ci = coeffs.ci, - coeffs.alpha = coeffs.alpha, info = info, light = light, alpha = alpha, cv = cv, - exclcols = exclcols, exclrows = exclrows, ncomp.selcrit = ncomp.selcrit) + model = pls.cal(x, y, ncomp, center = center, scale = scale, method = method, + coeffs.ci = coeffs.ci,coeffs.alpha = coeffs.alpha, info = info, light = light, + alpha = alpha, cv = cv, exclcols = exclcols, exclrows = exclrows, + ncomp.selcrit = ncomp.selcrit) # do test set validation if provided if (!is.null(x.test) && !is.null(y.test)){ @@ -246,7 +250,6 @@ pls = function(x, y, ncomp = 15, center = T, scale = F, cv = NULL, exclcols = NU # select optimal number of components model = selectCompNum(model) - model } @@ -291,8 +294,8 @@ pls = function(x, y, ncomp = 15, center = T, scale = F, cv = NULL, exclcols = NU #' @return model #' an object with calibrated PLS model #' -pls.cal = function(x, y, ncomp, center, scale, method, cv, alpha, coeffs.ci, coeffs.alpha, info, light, - exclcols = NULL, exclrows = NULL, ncomp.selcrit) { +pls.cal = function(x, y, ncomp, center, scale, method, cv, alpha, coeffs.ci, coeffs.alpha, info, + light, exclcols = NULL, exclrows = NULL, ncomp.selcrit) { # prepare empty list for model object model = list() @@ -326,7 +329,6 @@ pls.cal = function(x, y, ncomp, center, scale, method, cv, alpha, coeffs.ci, coe y = mda.df2mat(y) x.nrows = nrow(x) x.ncols = ncol(x) - y.nrows = nrow(y) y.ncols = ncol(y) # check if data has missing values @@ -355,7 +357,8 @@ pls.cal = function(x, y, ncomp, center, scale, method, cv, alpha, coeffs.ci, coe y.cal = y # check excluded rows - #if (length(x.attrs$exclrows) != length(y.attrs$exclrows) || any(x.attrs$exclrows != y.attrs$exclrows)) + #if (length(x.attrs$exclrows) != length(y.attrs$exclrows) || + # any(x.attrs$exclrows != y.attrs$exclrows)) # stop('Excluded rows in response and predictors matrices are not the same!') # remove excluded rows @@ -429,9 +432,12 @@ pls.cal = function(x, y, ncomp, center, scale, method, cv, alpha, coeffs.ci, coe rownames(xloadings) = rownames(weights) = colnames(x) colnames(xloadings) = colnames(weights) = paste('Comp', 1:ncomp) attr(xloadings, 'name') = 'X loadings' - attr(xloadings, 'xaxis.name') = attr(weights, 'xaxis.name') = attr(coeffs, 'xaxis.name') = 'Components' - attr(xloadings, 'yaxis.name') = attr(weights, 'yaxis.name') = attr(coeffs, 'yaxis.name') = x.attrs$xaxis.name - attr(xloadings, 'yaxis.values') = attr(weights, 'yaxis.values') = attr(coeffs, 'yaxis.values') = x.attrs$xaxis.values + attr(xloadings, 'xaxis.name') = attr(weights, 'xaxis.name') = + attr(coeffs, 'xaxis.name') = 'Components' + attr(xloadings, 'yaxis.name') = attr(weights, 'yaxis.name') = + attr(coeffs, 'yaxis.name') = x.attrs$xaxis.name + attr(xloadings, 'yaxis.values') = attr(weights, 'yaxis.values') = + attr(coeffs, 'yaxis.values') = x.attrs$xaxis.values # do the same for response related results yloadings = matrix(0, nrow = y.ncols, ncol = ncomp) @@ -479,7 +485,8 @@ pls.cal = function(x, y, ncomp, center, scale, method, cv, alpha, coeffs.ci, coe # do cross-validation if needed if (!is.null(cv)) { - res = pls.crossval(model, x, y, cv, center = center, scale = scale, method = method, jack.knife = jack.knife) + res = pls.crossval(model, x, y, cv, center = center, scale = scale, method = method, + jack.knife = jack.knife) if (jack.knife == T) { model$coeffs = regcoeffs(model$coeffs$values, res$jkcoeffs) res[['jkcoeffs']] = NULL @@ -513,7 +520,7 @@ pls.cal = function(x, y, ncomp, center, scale, method, cv, alpha, coeffs.ci, coe #' @export pls.run = function(x, y, ncomp, method, cv) { if (method == 'simpls') - res = pls.simpls(x, y, ncomp, cv = cv) + return(pls.simpls(x, y, ncomp, cv = cv)) else stop('Method with this name is not supported!') } @@ -544,12 +551,6 @@ pls.simpls = function(x, y, ncomp, cv = FALSE) { x = as.matrix(x) y = as.matrix(y) - # get names for objects, variables and components - objnames = rownames(x); - prednames = colnames(x); - respnames = colnames(y); - - nobj = nrow(x) npred = ncol(x) nresp = ncol(y) @@ -673,7 +674,6 @@ pls.crossval = function(model, x, y, cv, center, scale, method, jack.knife = T) # get matrix with indices for cv segments idx = crossval(nobj, cv) - seglen = ncol(idx); nseg = nrow(idx); nrep = dim(idx)[3] @@ -712,8 +712,7 @@ pls.crossval = function(model, x, y, cv, center, scale, method, jack.knife = T) # get scores xscores = xt %*% (m$weights %*% solve(crossprod(m$xloadings, m$weights))) - yscores = as.matrix(yt) %*% m$yloadings - + # make predictions yp = apply(m$coeffs, 3, function(x, y)(y %*% x), xt) dim(yp) = c(nrow(xt), ncomp, ncol(yt)) @@ -731,10 +730,10 @@ pls.crossval = function(model, x, y, cv, center, scale, method, jack.knife = T) yp = sweep(yp, 3, m$ycenter, '+') # get distances - xdist = ldecomp.getDistances(scores = xscores, loadings = m$xloadings, residuals = xresiduals, - tnorm = model$xtnorm) - ydist = ldecomp.getDistances(scores = xscores, loadings = m$yloadings, residuals = yresiduals, - tnorm = model$ytnorm) + xdist = ldecomp.getDistances(scores = xscores, loadings = m$xloadings, + residuals = xresiduals, tnorm = model$xtnorm) + ydist = ldecomp.getDistances(scores = xscores, loadings = m$yloadings, + residuals = yresiduals, tnorm = model$ytnorm) # correct dimenstion for reg coeffs for JK dim(m$coeffs) = c(dim(m$coeffs), 1) @@ -759,7 +758,8 @@ pls.crossval = function(model, x, y, cv, center, scale, method, jack.knife = T) jkcoeffs = jkcoeffs / nrep # set up names - dimnames(jkcoeffs) = list(colnames(x), colnames(model$coeffs$values), colnames(model$calres$y.ref), 1:nseg) + dimnames(jkcoeffs) = list(colnames(x), colnames(model$coeffs$values), + colnames(model$calres$y.ref), 1:nseg) dimnames(yp.cv) = list(rownames(x), colnames(model$coeffs$values), colnames(model$calres$y.ref)) # compute variance @@ -776,9 +776,11 @@ pls.crossval = function(model, x, y, cv, center, scale, method, jack.knife = T) # make pls results and return res = plsres(yp.cv, y.ref = y, ncomp.selected = model$ncomp.selected, - xdecomp = ldecomp(ncomp.selected = model$ncomp.selected, dist = list(Q = Qx, T2 = T2x), + xdecomp = ldecomp(ncomp.selected = model$ncomp.selected, + dist = list(Q = Qx, T2 = T2x), var = varx, loadings = model$xloadings, attrs = x.attrs), - ydecomp = ldecomp(ncomp.selected = model$ncomp.selected, dist = list(Q = Qy, T2 = T2y), + ydecomp = ldecomp(ncomp.selected = model$ncomp.selected, + dist = list(Q = Qy, T2 = T2y), var = vary, loadings = model$yloadings, attrs = y.attrs) ) if (jack.knife == T) @@ -796,57 +798,57 @@ pls.crossval = function(model, x, y, cv, center, scale, method, jack.knife = T) #' PLS model (object of class \code{pls}) #' @param ncomp #' number of components to select +#' @param selcrit +#' criterion for selecting optimal number of components (\code{'min'} for +#' first local minimum of RMSECV and \code{'wold'} for Wold's rule.) #' #' @return #' the same model with selected number of components #' #' @details -#' If number of components is not specified, the Wold's R criterion is used. +#' If number of components is not specified, the cross-validation statistics will be used. It can +#' be either first local minimum of RMSECV (`selcrit='min'`) or Wold's rule (`selcrit='wold'`) +#' based on ratio of PRESS values and threshold of 0.95. +#' #' See examples in help for \code{\link{pls}} function. #' #' @export -selectCompNum.pls = function(model, ncomp = NULL) -{ +selectCompNum.pls = function(model, ncomp = NULL, selcrit = model$ncomp.selcrit) { if (!is.null(ncomp)) { # user defined number of components if (ncomp > model$ncomp || ncomp < 0) stop('Wrong number of selected components!') } else { # automatic estimation of the optimal number of components - n = dim(model$coeffs$values)[3] + # calculate PRESS based on RMSE if (!is.null(model$cvres)) - press = (model$cvres$rmse * n)^2 + press = model$cvres$rmse^2 * nrow(model$cvres$y.pred) else if (!is.null(model$testres)) - press = (model$testres$rmse * n)^2 + press = model$testres$rmse^2 * nrow(model$testres$y.pred) else { - press = (model$calres$rmse * n)^2 + press = model$calres$rmse^2 * nrow(model$calres$ypred) warning('No validation results were found!') } - - if (model$ncomp.selcrit == 'wold') { + + ncomp = model$ncomp + if (selcrit == 'wold') { # using Wold's criterion - ncomp = ncol(press) - if (ncomp > 2) - { + if (ncomp > 2) { r = press[, 2:ncomp] / press[, 1:(ncomp - 1)] ind = which(r > 0.95, arr.ind = TRUE) - if (length(ind) == 0) - ncomp = 1 - else if (is.null(dim(ind))) + if (is.null(dim(ind))) ncomp = min(ind) else ncomp = min(ind[, 2]) } else { ncomp = which.min(press) } - } else if (model$ncomp.selcrit == 'min') { + } else if (selcrit == 'min') { # using first local minimum df = diff(as.vector(press)) > 0 if (any(df)) ncomp = which(df)[1] - else - ncomp = length(press) } else { stop('Wrong value for "ncomp.selcrit" argument!') } @@ -907,7 +909,6 @@ predict.pls = function(object, x, y = NULL, ...) { # get dimensions nresp = dim(object$coeffs$values)[3] - nobj = nrow(x) ncomp = object$ncomp # check dimensions @@ -916,7 +917,7 @@ predict.pls = function(object, x, y = NULL, ...) { if (!is.null(y)) { if (nrow(x) != nrow(y)) - stop('Matrices with predictors (x) and response values (y) should have the same number of rows!') + stop('Matrices with predictors (x) and response (y) should have the same number of rows!') if (ncol(y) != nresp) stop('Wrong number of columns in matrix with response values (y)!') } @@ -927,9 +928,7 @@ predict.pls = function(object, x, y = NULL, ...) { # compute x scores and residuals xscores = x %*% (object$weights %*% solve(crossprod(object$xloadings, object$weights))) xresiduals = x - tcrossprod(xscores, object$xloadings) - xdist = ldecomp.getDistances(xscores, object$xloadings, xresiduals, object$xtnorm) - - + # make predictions yp = apply(object$coeffs$values, 3, function(x, y)(y %*% x), x) dim(yp) = c(nrow(x), ncomp, dim(object$coeffs$values)[3]) @@ -973,7 +972,8 @@ predict.pls = function(object, x, y = NULL, ...) { yp = sweep(yp, 3, object$ycenter, '+') # set up all attributes and names - dimnames(yp) = list(rownames(x), dimnames(object$coeffs$values)[[2]], dimnames(object$coeffs$values)[[3]]) + dimnames(yp) = list(rownames(x), dimnames(object$coeffs$values)[[2]], + dimnames(object$coeffs$values)[[3]]) yp = mda.setattr(yp, y.attrs) attr(yp, 'name') = 'Response values, predicted' @@ -991,8 +991,10 @@ predict.pls = function(object, x, y = NULL, ...) { xtotvar = sum(x^2) # create xdecomp object - xdecomp = ldecomp(scores = xscores, residuals = xresiduals, loadings = object$xloadings, attrs = x.attrs, - ncomp.selected = object$ncomp.selected, tnorm = object$xtnorm, totvar = xtotvar) + xdecomp = ldecomp(scores = xscores, residuals = xresiduals, loadings = object$xloadings, + attrs = x.attrs, + ncomp.selected = object$ncomp.selected, + tnorm = object$xtnorm, totvar = xtotvar) xdecomp$totvar = xtotvar res = plsres(yp, y.ref = y.ref, ncomp.selected = object$ncomp.selected, @@ -1203,7 +1205,7 @@ getSelectivityRatio.pls = function(obj, ncomp = NULL, ny = 1, ...) { #' #' @export getVIPScores.pls = function(obj, ny = 1, ...) { - vipscores = mda.subset(obj$vipscores, select = ny) + return(mda.subset(obj$vipscores, select = ny)) } @@ -1220,7 +1222,8 @@ getVIPScores.pls = function(obj, ny = 1, ...) { #' @param ny #' if y is multivariate which variables you want to see the coefficients for #' @param full -#' if TRUE the method also shows p-values and t-values as well as confidence intervals for the coefficients (if available) +#' if TRUE the method also shows p-values and t-values as well as confidence intervals for the +#' coefficients (if available) #' @param alpha #' significance level for confidence intervals (a number between 0 and 1, e.g. for 95\% alpha = 0.05) #' @param ... @@ -1243,7 +1246,8 @@ getVIPScores.pls = function(obj, ny = 1, ...) { #' A matrix with regression coefficients and (optinally) statistics. #' #' @export -getRegcoeffs.pls = function(obj, ncomp = NULL, ny = NULL, full = FALSE, alpha = obj$coeffs.alpha, ...) { +getRegcoeffs.pls = function(obj, ncomp = NULL, ny = NULL, full = FALSE, + alpha = obj$coeffs.alpha, ...) { if (is.null(ncomp)) ncomp = obj$ncomp.selected else if (length(ncomp) != 1 || ncomp <= 0 || ncomp > obj$ncomp) @@ -1425,8 +1429,7 @@ plotSelectivityRatio.pls = function(obj, ncomp = NULL, ny = 1, type = 'l', #' @export plotRMSE.pls = function(obj, ny = 1, type = 'b', main = 'RMSE', xlab = 'Components', ylab = NULL, labels = 'values', ...) { - ncomp = obj$ncomp - + if (is.null(ylab)) { if (nrow(obj$calres$rmse) == 1) ylab = 'RMSE' @@ -1597,8 +1600,6 @@ plotYCumVariance.pls = function(obj, type = 'b', plotVariance.pls = function(obj, decomp = 'xdecomp', variance = 'expvar', type = 'b', main = 'X variance', xlab = 'Components', ylab = 'Explained variance, %', labels = 'values', ...) { - ncomp = obj$ncomp - data = list() data$cal = obj$calres[[decomp]][[variance]] @@ -1851,7 +1852,8 @@ plotYResiduals.pls = function(obj, ncomp = NULL, ny = 1, main = NULL, show.line if (!is.null(obj$testres)) { attr = mda.getattr(obj$testres$y.pred) - data$test = cbind(obj$testres$y.ref[, ny], obj$testres$y.ref[, ny] - obj$testres$y.pred[, ncomp, ny]) + data$test = cbind(obj$testres$y.ref[, ny], + obj$testres$y.ref[, ny] - obj$testres$y.pred[, ncomp, ny]) data$test = mda.setattr(data$test, attr) colnames(data$test) = c(xaxis.name, 'Residuals') rownames(data$test) = rownames(obj$testres$y.pred) @@ -1908,7 +1910,8 @@ plotRegcoeffs.pls = function(obj, ncomp = NULL, ...) { #' See examples in help for \code{\link{pls}} function. #' #' @export -plotXLoadings.pls = function(obj, comp = c(1, 2), type = 'p', main = 'X loadings', show.axes = T, ...) { +plotXLoadings.pls = function(obj, comp = c(1, 2), type = 'p', main = 'X loadings', + show.axes = T, ...) { ncomp = length(comp) if (min(comp) < 1 || max(comp) > obj$ncomp) @@ -1928,7 +1931,8 @@ plotXLoadings.pls = function(obj, comp = c(1, 2), type = 'p', main = 'X loadings data = mda.subset(obj$xloadings, select = comp) if (type == 'p') { - colnames(data) = paste('Comp ', comp, ' (', round(obj$calres$xdecomp$expvar[comp], 2) , '%)', sep = '') + colnames(data) = paste('Comp ', comp, ' (', round(obj$calres$xdecomp$expvar[comp], 2) , '%)', + sep = '') mdaplot(data, main = main, type = type, show.lines = show.lines, ...) } else { data = mda.t(data) @@ -2025,7 +2029,8 @@ plotXResiduals.pls = function(obj, ncomp = NULL, main = NULL, xlab = 'T2', if (!is.null(obj$testres)) { attr = mda.getattr(obj$testres$xdecomp$Q) - data$test = cbind(obj$testres$xdecomp$T2[, ncomp, drop = F], obj$testres$xdecomp$Q[, ncomp, drop = F]) + data$test = cbind(obj$testres$xdecomp$T2[, ncomp, drop = F], + obj$testres$xdecomp$Q[, ncomp, drop = F]) data$test = mda.setattr(data$test, attr) rownames(data$test) = rownames(obj$testres$xdecomp$Q) } From 1f43b1fa3dbc6e324c67fe4c658d29a00df01601 Mon Sep 17 00:00:00 2001 From: Sergey Kucheryavskiy Date: Fri, 9 Nov 2018 14:16:53 +0100 Subject: [PATCH 6/8] several improvements to ipls() method and fix for #56 --- R/ipls.R | 101 ++++++++++++++++++------------------------------------- 1 file changed, 33 insertions(+), 68 deletions(-) diff --git a/R/ipls.R b/R/ipls.R index c0dbb78..825b8e0 100644 --- a/R/ipls.R +++ b/R/ipls.R @@ -100,9 +100,9 @@ #' #' @export ipls = function(x, y, glob.ncomp = 10, center = T, scale = F, cv = 10, - exclcols = NULL, exclrows = NULL, int.ncomp = 10, int.num = NULL, int.width = NULL, - int.limits = NULL, int.niter = NULL, ncomp.selcrit = 'min', method = 'forward', - silent = F) { + exclcols = NULL, exclrows = NULL, int.ncomp = glob.ncomp, int.num = NULL, + int.width = NULL, int.limits = NULL, int.niter = NULL, ncomp.selcrit = 'min', + method = 'forward', silent = F) { # process names and values for xaxis xaxis.name = attr(x, 'xaxis.name') @@ -325,12 +325,12 @@ ipls.forward = function(x, y, obj) { # do loop for max number of intervals selind = NULL + rmse = Inf for (i in 1:obj$int.niter) { if (!obj$silent) cat(sprintf('Iteration %3d/%3d... ', i, obj$int.niter)) sel = NULL - rmse = 99999999999999 for (l in int.nonselected) { # combine already selected intervals with the current ind = obj$int.limits[l, 1]:obj$int.limits[l, 2] @@ -361,15 +361,15 @@ ipls.forward = function(x, y, obj) { # obj$int.limits[l, 1],obj$int.limits[l, 2])) # else check if rmse has been improved - if (rmse - m$cvres$rmse[1, m$ncomp.selected] > 0) - { + if (rmse > m$cvres$rmse[1, m$ncomp.selected]) { ncomp = m$ncomp.selected - rmse = m$cvres$rmse[1, ncomp] - r2 = m$cvres$r2[1, ncomp] + rmse = m$cvres$rmse[1, m$ncomp.selected] + r2 = m$cvres$r2[1, m$ncomp.selected] sel = l } } + if (!is.null(sel)) { selind = c(selind, obj$int.limits[sel, 1]:obj$int.limits[sel, 2]) int.nonselected = int.nonselected[int.nonselected != sel] @@ -379,32 +379,22 @@ ipls.forward = function(x, y, obj) { 'n' = sel, 'start' = obj$int.limits[sel, 1], 'end' = obj$int.limits[sel, 2], - 'selected' = F, + 'selected' = T, 'nComp' = ncomp, 'RMSE' = rmse, 'R2' = r2 - )) + )) if (!obj$silent) cat(sprintf('selected interval %3d (RMSECV = %f)\n', sel, rmse)) - } else { # no improvements, quit the outer loop + if (!obj$silent) + cat('no improvements, stop.\n') break } } - # find which variables to select using first local minimum - df = diff(int.stat$RMSE[2:nrow(int.stat)]) > 0 - nsel = which(df)[1] + 1 - - if (any(df)) - isel = 2:nsel - else - isel = 2:nrow(int.stat) - int.selected = int.stat$n[isel] - int.stat$selected[isel] = TRUE - # return the selection results obj$glob.stat = glob.stat obj$int.stat = int.stat @@ -452,6 +442,7 @@ ipls.backward = function(x, y, obj) { # do loop for max number of intervals unselind = NULL + rmse = Inf for (i in 1:obj$int.niter) { if (length(int.selected) == 1) break @@ -460,9 +451,7 @@ ipls.backward = function(x, y, obj) { cat(sprintf('Iteration %3d/%3d... ', i, obj$int.niter)) # do loop to select an interval - unsel = NULL - rmse = 99999999999999 for (l in int.selected) { # combine already selected intervals with the current ind = obj$int.limits[l, 1]:obj$int.limits[l, 2] @@ -473,8 +462,7 @@ ipls.backward = function(x, y, obj) { cv = obj$cv, light = TRUE, ncomp.selcrit = obj$ncomp.selcrit) # if first round, build a data frame with statistics for each interval - if (i == 1) - { + if (i == 1) { glob.stat = rbind(glob.stat, data.frame( 'n' = l, @@ -505,11 +493,11 @@ ipls.backward = function(x, y, obj) { 'R2' = m$cvres$r2[1, m$ncomp.selected] )) unsel = NULL - } else if (rmse - m$cvres$rmse[1, m$ncomp.selected] > 0) { + } else if (rmse > m$cvres$rmse[1, m$ncomp.selected]) { # else check if rmse has been improved ncomp = m$ncomp.selected - rmse = m$cvres$rmse[1, ncomp] - r2 = m$cvres$r2[1, ncomp] + rmse = m$cvres$rmse[1, m$ncomp.selected] + r2 = m$cvres$r2[1, m$ncomp.selected] unsel = l } } @@ -534,32 +522,12 @@ ipls.backward = function(x, y, obj) { } else { # no improvements, quit the outer loop + if (!obj$silent) + cat('no improvements, stop.\n') break } } - # sort last two rows if all intervals were processed - if (obj$int.niter == obj$int.num) { - nr = nrow(int.stat) - if (int.stat$RMSE[nr] < int.stat$RMSE[nr - 1]) { - a = int.stat[nr, ] - int.stat[nr, ] = int.stat[nr - 1, ] - int.stat[nr - 1, ] = a - } - } - - # find which variables to select using first local minimum - df = diff(int.stat$RMSE[2:nrow(int.stat)]) > 0 - nsel = which(df)[1] + 2 - - if (any(df)) - isel = nsel:nrow(int.stat) - else - isel = 2:nrow(int.stat) - - int.selected = int.stat$n[isel] - int.stat$selected[isel] = TRUE - # return the selection results obj$glob.stat = glob.stat obj$int.stat = int.stat @@ -640,20 +608,20 @@ plotSelection.ipls = function(obj, glob.ncomp = NULL, main = 'iPLS results', bars(mids, rmse, col = rgb(0.9, 0.9, 0.9), bwd = bwd, border = rgb(0.8, 0.8, 0.8)) bars(mids[obj$int.selected], rmse[obj$int.selected], col = rgb(0.5, 1.0, 0.6), bwd = bwd[obj$int.selected], border = rgb(0.4, 0.9, 0.5)) - + # mean signal lines(xlabels, xmean, col = rgb(1.0, 0.7, 0.7), lwd = 2) - + # number of components for each interval text(mids, matrix(0.05 * ylim[2], ncol = length(mids)), ncomp, col = rgb(0.4, 0.4, 0.4), cex = 0.85) - + # global model if (is.null(glob.ncomp)) glob.ncomp = obj$gm$ncomp.selected else if (glob.ncomp < 1 || glob.ncomp > obj$gm$ncomp) stop('Wrong value for number of components!') - + dx = (xlim[2] - xlim[1])/50 abline(h = obj$gm$cvres$rmse[1, glob.ncomp], lty = 2, col = rgb(0.5, 0.5, 0.5)) text(xlim[2] + dx, obj$gm$cvres$rmse[1, glob.ncomp], glob.ncomp, cex = 0.85, @@ -684,11 +652,10 @@ plotSelection.ipls = function(obj, glob.ncomp = NULL, main = 'iPLS results', #' other arguments #' #' @details -#' The plot shows RMSE values obtained at each iteration of the iPLS selection -#' algorithm as bars. The first bar correspond to the global model with all variables -#' included, second - to the model obtained at the first iteration and so on. Number -#' at the bottom of each bar corresponds to the interval included or excluded at the -#' particular iteration. The selected intervals are shown with green color. +#' The plot shows RMSE values obtained at each iteration of the iPLS algorithm as bars. The first +#' bar correspond to the global model with all variables included, second - to the model obtained +#' at the first iteration and so on. Number at the bottom of each bar corresponds to the interval +#' included or excluded at the particular iteration. #' #' @seealso #' \code{\link{summary.ipls}}, \code{\link{plotSelection.ipls}} @@ -704,23 +671,21 @@ plotRMSE.ipls = function(obj, glob.ncomp = NULL, main = 'RMSE development', xlab rmse = obj$int.stat$RMSE n = obj$int.stat$n - i = obj$int.stat$selected mids = 0:(length(n) - 1) - + if (is.null(xlim)) xlim = c(min(mids) - 0.5, max(mids) + 0.5) if (is.null(ylim)) ylim = c(0, max(rmse) * 1.1) - + # make plot plot(0, 0, type = 'n', main = main, xlab = xlab, ylab = ylab, xlim = xlim, ylim = ylim, ...) - + # gray and green bars - bars(mids, rmse, col = rgb(0.9, 0.9, 0.9), bwd = 1, border = rgb(0.8, 0.8, 0.8)) + bars(mids, rmse, col = rgb(0.5, 1.0, 0.6), bwd = 1, border = rgb(0.4, 0.9, 0.5)) bars(mids[1], rmse[1], col = rgb(0.98, 0.98, 0.98), bwd = 1, border = rgb(0.85, 0.85, 0.85)) - bars(mids[i], rmse[i], col = rgb(0.5, 1.0, 0.6), bwd = 1, border = rgb(0.4, 0.9, 0.5)) - - # number of components for each interval + + # interval numbers text(mids[-1], ylim[1] + (ylim[2] - ylim[1])/25, n[-1], col = rgb(0.4, 0.4, 0.4), cex = 0.80) } From 370098a641d5aec785d19480d4e66239799f6d72 Mon Sep 17 00:00:00 2001 From: Sergey Kucheryavskiy Date: Fri, 9 Nov 2018 14:37:07 +0100 Subject: [PATCH 7/8] improved and extended tests for prep.autoscale() --- tests/testthat/test_prep.R | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test_prep.R b/tests/testthat/test_prep.R index ac3ff60..fde7aac 100644 --- a/tests/testthat/test_prep.R +++ b/tests/testthat/test_prep.R @@ -13,11 +13,11 @@ p13 = prep.autoscale(X1, center = T, scale = T) # spectra with constant and small variation variables X2 = simdata$spectra.c X2[, 20:50] = matrix(apply(X2[, 20:50], 2, mean), nrow = nrow(X2), ncol = 31, byrow = T) # constant -X2[, 51:60] = X2[, 51:60] * 0.001 + 0.1 # small CV +X2[, 51:60] = X2[, 51:60] * 0.001 + 0.1 # small CV (around 0.17%) p21 = prep.autoscale(X2) p22 = prep.autoscale(X2, center = T, scale = F) p23 = prep.autoscale(X2, center = T, scale = T) -p24 = prep.autoscale(X2, center = T, scale = T, max.cov = 1) +p24 = prep.autoscale(X2, center = T, scale = T, max.cov = 0.2) test_that("Default autoscaling is correct", { expect_equal(attr(p11, 'prep:center'), apply(X1, 2, mean)) @@ -40,6 +40,12 @@ test_that("Default autoscaling for data with constant variables is correct", { }) test_that("Full autoscaling for data with constant variables is correct", { - expect_equal(attr(p23, 'prep:center'), apply(X1, 2, mean)) - expect_equal(attr(p23, 'prep:scale')[-(20:50)], apply(X1, 2, sd)[-(20:50)]) + expect_equal(attr(p23, 'prep:center'), apply(X2, 2, mean)) + expect_equal(attr(p23, 'prep:scale')[-(20:50)], apply(X2, 2, sd)[-(20:50)]) + expect_equal(attr(p23, 'prep:scale')[(20:50)], rep(1, 31), check.attributes = FALSE) +}) + +test_that("Full autoscaling with limits for max.cov is correct", { + expect_equal(attr(p24, 'prep:scale')[-(20:60)], apply(X2, 2, sd)[-(20:60)]) + expect_equal(attr(p24, 'prep:scale')[(20:60)], rep(1, 41), check.attributes = FALSE) }) From 9398bf7bd2a346eabd70b5b33848f17fa8fc5e76 Mon Sep 17 00:00:00 2001 From: Sergey Kucheryavskiy Date: Fri, 9 Nov 2018 14:37:27 +0100 Subject: [PATCH 8/8] text is amended according to the new release --- DESCRIPTION | 4 ++-- NEWS.md | 6 ++++++ README.md | 18 ++++++------------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8388c5f..70cbfc3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: mdatools Title: Multivariate Data Analysis for Chemometrics -Version: 0.9.1 -Date: 2018-07-06 +Version: 0.9.2 +Date: 2018-11-09 Author: Sergey Kucheryavskiy Maintainer: Sergey Kucheryavskiy Description: Package implements projection based methods for preprocessing, diff --git a/NEWS.md b/NEWS.md index 29b30b8..b3adeea 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ v.0.9.2 ======= +* improvements to `ipls()` method plus fixed a bug preventing breaking the selection loop (#56) +* fixed a bug in `selectCompNum()` related to use of Wold criterion (#57) +* fixed a bug with using of `max.cov` parameter in `prep.autoscale()` (#58) +* default `max.cov` value in `prep.autoscale()` is set to 0 (to avoid scaling only of constant variables) +* code refactoring and small improvements +* added tests for `prep.autoscale()` v.0.9.1 ======= diff --git a/README.md b/README.md index 85a945d..123298c 100755 --- a/README.md +++ b/README.md @@ -1,21 +1,16 @@ Multivariate Data Analysis Tools =========================================== -mdatools is an R package for preprocessing, exploring and analysis of multivariate data. The package -provides methods mostly common for [Chemometrics](http://en.wikipedia.org/wiki/Chemometrics). It was -created for an introductory PhD course on Chemometrics given at Section of Chemical Engineering, -Aalborg University. +*mdatools* is an R package for preprocessing, exploring and analysis of multivariate data. The package provides methods mostly common for [Chemometrics](http://en.wikipedia.org/wiki/Chemometrics). It was created for an introductory PhD course on Chemometrics given at Section of Chemical Engineering, Aalborg University. -The general idea of the package is to collect most widespread chemometric methods and give a similar -"user interface" for using them. So if a user knows how to make a model and visualise results for one -method, he or she can easily do this for the others. +The general idea of the package is to collect most widespread chemometric methods and give a similar "user interface" (or rather API) for using them. So if a user knows how to make a model and visualise results for one method, he or she can easily do this for the others. For more details and examples read a [Bookdown tutorial](http://svkucheryavski.github.io/mdatools/). What is new ----------- -New minor release (0.9.1) is available both from GitHub and CRAN (from 07.07.2018). +New minor release (0.9.2) is available both from GitHub and CRAN (from 07.07.2018). The latest major release (0.9.0) brings a set of new features, including methods for computing of critical limits for PCA/SIMCA residuals, adjuested residuals plot, and randomized algorithms for fast PCA decomposition of dataset with large number of rows. The text of tutorial has been amended correspondingly and now also includes a new chapter with detailed explanation of calculation of the critical limits. @@ -25,14 +20,13 @@ A full list of changes is available [here](NEWS.md) How to install -------------- -The package is available from CRAN by usual installing procedure. However due to restrictions in CRAN politics regarding number of submissions (one in 3-4 month) only major releases will be published there (with 2-3 weeks delay after GitHub release as more thorought testing is needed). To get the latest release plase use [GitHub sources](https://github.com/svkucheryavski/mdatools). You can [download](https://github.com/svkucheryavski/mdatools/releases) a zip-file with source package and install it using the `install.packages` command, e.g. if the downloaded file is `mdatools_0.9.1.tar.gz` and it is located in a current working directory, just run the following: +The package is available from CRAN by usual installing procedure. However due to restrictions in CRAN politics regarding number of submissions (one in 3-4 month) only major releases will be published there (with 2-3 weeks delay after GitHub release as more thorought testing is needed). To get the latest release plase use [GitHub sources](https://github.com/svkucheryavski/mdatools). You can [download](https://github.com/svkucheryavski/mdatools/releases) a zip-file with source package and install it using the `install.packages` command, e.g. if the downloaded file is `mdatools_0.9.2.tar.gz` and it is located in a current working directory, just run the following: ``` -install.packages('mdatools_0.9.1.tar.gz') +install.packages('mdatools_0.9.2.tar.gz') ``` -If you have `devtools` package installed, the following command will install the latest release from -the master branch of GitHub repository (do not forget to load the `devtools` package first): +If you have `devtools` package installed, the following command will install the latest release from the master branch of GitHub repository (do not forget to load the `devtools` package first): ``` install_github('svkucheryavski/mdatools')