From f13cfa1cc7bdd513dfd8865aa1db3a7d8e2a9f26 Mon Sep 17 00:00:00 2001 From: Lennart Date: Thu, 7 Jan 2021 11:39:05 +0100 Subject: [PATCH 1/5] made robust vs small training sets for testing purposes --- PREFACE.R | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/PREFACE.R b/PREFACE.R index 5c39be1..d07e989 100755 --- a/PREFACE.R +++ b/PREFACE.R @@ -132,6 +132,7 @@ train <- function(args){ suppressMessages(library('foreach')) suppressMessages(library('doParallel')) suppressMessages(library('MASS')) + suppressMessages(library('irlba')) # Arg parse @@ -203,7 +204,8 @@ train <- function(args){ cat(paste0('Creating training frame ...\n')) - training.frame <- cbind(training.frame[!(training.frame$chr %in% exclude.chrs),], training.frame.sub[!(training.frame$chr %in% exclude.chrs),]) + training.frame <- cbind(training.frame[!(training.frame$chr %in% exclude.chrs),], + training.frame.sub[!(training.frame$chr %in% exclude.chrs),]) training.frame.t <- t(training.frame[4:ncol(training.frame)]) colnames(training.frame.t) <- paste0(training.frame$chr, ':', training.frame$start, '-', training.frame$end) @@ -215,7 +217,7 @@ train <- function(args){ mean.features <- colMeans(training.frame, na.rm = T) na.index <- which(is.na(training.frame), arr.ind=TRUE) - training.frame[na.index] <- mean.features[na.index[,2]] + if (length(na.index[,2])) training.frame[na.index] <- mean.features[na.index[,2]] cat(paste0('Remaining training features after \'NA\' filtering: ', length(possible.features), '\n')) @@ -228,6 +230,12 @@ train <- function(args){ test.number = length(which(config.file$gender %in% train.gender)) * test.percentage + max.feat <- length(which(config.file$gender %in% train.gender)) - as.integer(test.number) - 1 + if (n.feat > max.feat){ + cat(paste0('Too few samples were provided for --nfeat ', n.feat, ', using --nfeat ', max.feat, '\n')) + n.feat <- max.feat + } + oper <- foreach(i = 1:repeats) %dopar% { cat(paste0('Model training | Repeat ', i,'/', repeats, ' ...\n')) @@ -237,7 +245,9 @@ train <- function(args){ train.index.subset <- sort(which(config.file$gender[-test.index.overall] %in% train.gender)) cat(paste0('\tExecuting principal component analysis ...\n')) - pca.train <- prcomp(training.frame[-test.index.overall,]) + pca.train <- suppressWarnings(prcomp_irlba(training.frame[-test.index.overall,], + n = min(n.feat * 2, length(train.index.overall)), scale. = F)) + X.train <- as.matrix(pca.train$x[train.index.subset, ]) Y.train <- as.matrix(config.file$FF[train.index.overall], ncol = 1) X.test <- as.matrix(scale(training.frame[test.index.overall,], pca.train$center, pca.train$scale) %*% pca.train$rotation) @@ -336,7 +346,7 @@ train <- function(args){ predictions <- the.intercept + the.slope * predictions cat(paste0('Executing final principal component analysis ...\n')) - pca.train <- prcomp(training.frame) + pca.train <- suppressWarnings(prcomp_irlba(training.frame, n = min(n.feat * 2, nrow(training.frame) - 1), scale. = F)) X.train <- as.matrix(pca.train$x[which(config.file$gender %in% train.gender), ]) Y.train <- as.matrix(config.file$FF[which(config.file$gender %in% train.gender)], ncol = 1) From 2e6416a17a0cc5a80703d8449292ad4d42b01fe7 Mon Sep 17 00:00:00 2001 From: Lennart Date: Thu, 7 Jan 2021 11:39:19 +0100 Subject: [PATCH 2/5] bumped version --- PREFACE.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PREFACE.R b/PREFACE.R index d07e989..1197fb2 100755 --- a/PREFACE.R +++ b/PREFACE.R @@ -1,4 +1,4 @@ -version = 'v0.1.1' +version = 'v0.1.2' # --- # Functions From 509ec09d99551c29801efb344a92ea887570c847 Mon Sep 17 00:00:00 2001 From: Lennart Date: Thu, 7 Jan 2021 11:44:58 +0100 Subject: [PATCH 3/5] made robust vs small training sets for testing purposes --- PREFACE.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PREFACE.R b/PREFACE.R index 1197fb2..14ba1fc 100755 --- a/PREFACE.R +++ b/PREFACE.R @@ -246,7 +246,7 @@ train <- function(args){ cat(paste0('\tExecuting principal component analysis ...\n')) pca.train <- suppressWarnings(prcomp_irlba(training.frame[-test.index.overall,], - n = min(n.feat * 2, length(train.index.overall)), scale. = F)) + n = min(n.feat * 2, nrow(training.frame[-test.index.overall,]) - 1), scale. = F)) X.train <- as.matrix(pca.train$x[train.index.subset, ]) Y.train <- as.matrix(config.file$FF[train.index.overall], ncol = 1) From 4d8c2f18deaae8cdedc12ff9324ea998e6469749 Mon Sep 17 00:00:00 2001 From: Lennart Date: Thu, 7 Jan 2021 11:48:52 +0100 Subject: [PATCH 4/5] added dependency --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a825fa9..d248c78 100644 --- a/README.md +++ b/README.md @@ -75,10 +75,11 @@ RScript PREFACE.R predict --infile path/to/infile.bed --model path/to/model.RDat - glmnet (v2.0-16) - data.table (v1.11.8) - MASS (v7.3-49) +- irlba (v2.3.3) Other versions are of course expected to work equally well. To install within R use: ```bash -install.packages(c('data.table', 'glmnet', 'neuralnet', 'foreach', 'doParallel', 'MASS')) +install.packages(c('data.table', 'glmnet', 'neuralnet', 'foreach', 'doParallel', 'MASS', 'irlba')) ``` From 27e5b8ee1f0d95080a045982c08268f890d393f1 Mon Sep 17 00:00:00 2001 From: Lennart Date: Thu, 7 Jan 2021 11:56:15 +0100 Subject: [PATCH 5/5] increased components for plotting purposes --- PREFACE.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PREFACE.R b/PREFACE.R index 14ba1fc..68c09bd 100755 --- a/PREFACE.R +++ b/PREFACE.R @@ -246,7 +246,7 @@ train <- function(args){ cat(paste0('\tExecuting principal component analysis ...\n')) pca.train <- suppressWarnings(prcomp_irlba(training.frame[-test.index.overall,], - n = min(n.feat * 2, nrow(training.frame[-test.index.overall,]) - 1), scale. = F)) + n = min(n.feat * 10, nrow(training.frame[-test.index.overall,]) - 1), scale. = F)) X.train <- as.matrix(pca.train$x[train.index.subset, ]) Y.train <- as.matrix(config.file$FF[train.index.overall], ncol = 1) @@ -346,7 +346,7 @@ train <- function(args){ predictions <- the.intercept + the.slope * predictions cat(paste0('Executing final principal component analysis ...\n')) - pca.train <- suppressWarnings(prcomp_irlba(training.frame, n = min(n.feat * 2, nrow(training.frame) - 1), scale. = F)) + pca.train <- suppressWarnings(prcomp_irlba(training.frame, n = min(n.feat * 10, nrow(training.frame) - 1), scale. = F)) X.train <- as.matrix(pca.train$x[which(config.file$gender %in% train.gender), ]) Y.train <- as.matrix(config.file$FF[which(config.file$gender %in% train.gender)], ncol = 1)