From f13cfa1cc7bdd513dfd8865aa1db3a7d8e2a9f26 Mon Sep 17 00:00:00 2001
From: Lennart <leraman@MacBook-Pro-Lennart.local>
Date: Thu, 7 Jan 2021 11:39:05 +0100
Subject: [PATCH 1/5] made training robust to small training sets (for testing purposes)

---
 PREFACE.R | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/PREFACE.R b/PREFACE.R
index 5c39be1..d07e989 100755
--- a/PREFACE.R
+++ b/PREFACE.R
@@ -132,6 +132,7 @@ train <- function(args){
   suppressMessages(library('foreach'))
   suppressMessages(library('doParallel'))
   suppressMessages(library('MASS'))
+  suppressMessages(library('irlba'))
   
   # Arg parse
   
@@ -203,7 +204,8 @@ train <- function(args){
 
   cat(paste0('Creating training frame ...\n'))
   
-  training.frame <- cbind(training.frame[!(training.frame$chr %in% exclude.chrs),], training.frame.sub[!(training.frame$chr %in% exclude.chrs),])
+  training.frame <- cbind(training.frame[!(training.frame$chr %in% exclude.chrs),],
+                          training.frame.sub[!(training.frame$chr %in% exclude.chrs),])
   training.frame.t <- t(training.frame[4:ncol(training.frame)])
   colnames(training.frame.t) <- paste0(training.frame$chr, ':', training.frame$start, '-', training.frame$end)
   
@@ -215,7 +217,7 @@ train <- function(args){
   mean.features <- colMeans(training.frame, na.rm = T)
   
   na.index <- which(is.na(training.frame), arr.ind=TRUE)
-  training.frame[na.index] <- mean.features[na.index[,2]]
+  if (length(na.index[,2])) training.frame[na.index] <- mean.features[na.index[,2]]
   
   cat(paste0('Remaining training features after \'NA\' filtering: ', length(possible.features), '\n'))
   
@@ -228,6 +230,12 @@ train <- function(args){
   
   test.number = length(which(config.file$gender %in% train.gender)) * test.percentage
   
+  max.feat <- length(which(config.file$gender %in% train.gender)) - as.integer(test.number) - 1
+  if (n.feat > max.feat){
+    cat(paste0('Too few samples were provided for --nfeat ', n.feat, ', using --nfeat ', max.feat, '\n'))
+    n.feat <- max.feat
+  }
+  
   oper <- foreach(i = 1:repeats) %dopar% {
 
     cat(paste0('Model training | Repeat ', i,'/', repeats, ' ...\n'))
@@ -237,7 +245,9 @@ train <- function(args){
     train.index.subset <- sort(which(config.file$gender[-test.index.overall] %in% train.gender))
     
     cat(paste0('\tExecuting principal component analysis ...\n'))
-    pca.train <- prcomp(training.frame[-test.index.overall,])
+    pca.train <- suppressWarnings(prcomp_irlba(training.frame[-test.index.overall,],
+                              n = min(n.feat * 2, length(train.index.overall)), scale. = F))
+    
     X.train <- as.matrix(pca.train$x[train.index.subset, ])
     Y.train <- as.matrix(config.file$FF[train.index.overall], ncol = 1)
     X.test <- as.matrix(scale(training.frame[test.index.overall,], pca.train$center, pca.train$scale) %*% pca.train$rotation)
@@ -336,7 +346,7 @@ train <- function(args){
   predictions <- the.intercept + the.slope * predictions
   
   cat(paste0('Executing final principal component analysis ...\n'))
-  pca.train <- prcomp(training.frame)
+  pca.train <- suppressWarnings(prcomp_irlba(training.frame, n = min(n.feat * 2, nrow(training.frame) - 1), scale. = F))
   X.train <- as.matrix(pca.train$x[which(config.file$gender %in% train.gender), ])
   Y.train <- as.matrix(config.file$FF[which(config.file$gender %in% train.gender)], ncol = 1)
   

From 2e6416a17a0cc5a80703d8449292ad4d42b01fe7 Mon Sep 17 00:00:00 2001
From: Lennart <leraman@MacBook-Pro-Lennart.local>
Date: Thu, 7 Jan 2021 11:39:19 +0100
Subject: [PATCH 2/5] bumped version

---
 PREFACE.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PREFACE.R b/PREFACE.R
index d07e989..1197fb2 100755
--- a/PREFACE.R
+++ b/PREFACE.R
@@ -1,4 +1,4 @@
-version = 'v0.1.1'
+version = 'v0.1.2'
 
 # ---
 # Functions

From 509ec09d99551c29801efb344a92ea887570c847 Mon Sep 17 00:00:00 2001
From: Lennart <leraman@MacBook-Pro-Lennart.local>
Date: Thu, 7 Jan 2021 11:44:58 +0100
Subject: [PATCH 3/5] capped PCA component count at nrow - 1 for small training sets

---
 PREFACE.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PREFACE.R b/PREFACE.R
index 1197fb2..14ba1fc 100755
--- a/PREFACE.R
+++ b/PREFACE.R
@@ -246,7 +246,7 @@ train <- function(args){
     
     cat(paste0('\tExecuting principal component analysis ...\n'))
     pca.train <- suppressWarnings(prcomp_irlba(training.frame[-test.index.overall,],
-                              n = min(n.feat * 2, length(train.index.overall)), scale. = F))
+                              n = min(n.feat * 2, nrow(training.frame[-test.index.overall,]) - 1), scale. = F))
     
     X.train <- as.matrix(pca.train$x[train.index.subset, ])
     Y.train <- as.matrix(config.file$FF[train.index.overall], ncol = 1)

From 4d8c2f18deaae8cdedc12ff9324ea998e6469749 Mon Sep 17 00:00:00 2001
From: Lennart <leraman@MacBook-Pro-Lennart.local>
Date: Thu, 7 Jan 2021 11:48:52 +0100
Subject: [PATCH 4/5] added irlba dependency to README

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a825fa9..d248c78 100644
--- a/README.md
+++ b/README.md
@@ -75,10 +75,11 @@ RScript PREFACE.R predict --infile path/to/infile.bed --model path/to/model.RDat
 - glmnet (v2.0-16)  
 - data.table (v1.11.8)  
 - MASS (v7.3-49)  
+- irlba (v2.3.3)  
 
 Other versions are of course expected to work equally well. To install within R use:  
 
 ```bash
 
-install.packages(c('data.table', 'glmnet', 'neuralnet', 'foreach', 'doParallel', 'MASS'))
+install.packages(c('data.table', 'glmnet', 'neuralnet', 'foreach', 'doParallel', 'MASS', 'irlba'))
 ```

From 27e5b8ee1f0d95080a045982c08268f890d393f1 Mon Sep 17 00:00:00 2001
From: Lennart <leraman@MacBook-Pro-Lennart.local>
Date: Thu, 7 Jan 2021 11:56:15 +0100
Subject: [PATCH 5/5] increased PCA components (n.feat * 2 -> n.feat * 10) for plotting purposes

---
 PREFACE.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PREFACE.R b/PREFACE.R
index 14ba1fc..68c09bd 100755
--- a/PREFACE.R
+++ b/PREFACE.R
@@ -246,7 +246,7 @@ train <- function(args){
     
     cat(paste0('\tExecuting principal component analysis ...\n'))
     pca.train <- suppressWarnings(prcomp_irlba(training.frame[-test.index.overall,],
-                              n = min(n.feat * 2, nrow(training.frame[-test.index.overall,]) - 1), scale. = F))
+                              n = min(n.feat * 10, nrow(training.frame[-test.index.overall,]) - 1), scale. = F))
     
     X.train <- as.matrix(pca.train$x[train.index.subset, ])
     Y.train <- as.matrix(config.file$FF[train.index.overall], ncol = 1)
@@ -346,7 +346,7 @@ train <- function(args){
   predictions <- the.intercept + the.slope * predictions
   
   cat(paste0('Executing final principal component analysis ...\n'))
-  pca.train <- suppressWarnings(prcomp_irlba(training.frame, n = min(n.feat * 2, nrow(training.frame) - 1), scale. = F))
+  pca.train <- suppressWarnings(prcomp_irlba(training.frame, n = min(n.feat * 10, nrow(training.frame) - 1), scale. = F))
   X.train <- as.matrix(pca.train$x[which(config.file$gender %in% train.gender), ])
   Y.train <- as.matrix(config.file$FF[which(config.file$gender %in% train.gender)], ncol = 1)