From 374ee18598725bece0139dbbcfd83aecc99ca69f Mon Sep 17 00:00:00 2001 From: Lennart Date: Thu, 27 Jun 2019 15:44:40 +0200 Subject: [PATCH] Updated symbols | updated readme/output --- PREFACE.R | 18 +++++++++--------- README.md | 48 +++++++++++++++++++++++------------------------- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/PREFACE.R b/PREFACE.R index 20988f4..5c39be1 100755 --- a/PREFACE.R +++ b/PREFACE.R @@ -1,4 +1,4 @@ -version = 'v0.1.0' +version = 'v0.1.1' # --- # Functions @@ -89,7 +89,7 @@ plot.performance <- function(v1, v2, summary, n.feat, xlab, ylab, path){ par(xpd=NA) text(0, mx * 1.03, - paste0('(ρ = ', signif(cor(v1, v2), 3), ')'), + paste0('(r = ', signif(cor(v1, v2), 3), ')'), cex = 0.9, adj = 0) t = hist(v1-v2, max(20,length(v1)/10), axes = F, xlab = paste0(xlab, ' - ', ylab), main = 'Histogram', ylab = 'Density', c = 'black') @@ -114,7 +114,7 @@ train.neural <- function(f, train.nn, hidden){ tryCatch({ return(neuralnet(f, train.nn, hidden = hidden, stepmax = 1e6)) }, warning = function(e) { - cat(paste0('Neural network did not converge. Re-run and decrease --hidden or --nfeat. Alternatively, use --olm.\n')) + cat(paste0('Neural network did not converge. Re-run and decrease --hidden or optimize --nfeat. Alternatively, use --olm.\n')) quit(save = 'no') }) } @@ -194,7 +194,7 @@ train <- function(args){ sample <- config.file$ID[i] cat(paste0('Loading sample ', sample, ' | ', nrow(config.file) - i, '/', nrow(config.file), ' remaining ...\n')) bin.table <- fread(config.file$filepath[i], header = T, sep = '\t') - return(as.numeric(bin.table$ratio[bin.table$chr != 'Y'])) + return(suppressWarnings(as.numeric(bin.table$ratio[bin.table$chr != 'Y']))) } colnames(training.frame.sub) <- config.file$ID @@ -217,7 +217,7 @@ train <- function(args){ na.index <- which(is.na(training.frame), arr.ind=TRUE) training.frame[na.index] <- mean.features[na.index[,2]] - cat(paste0('Remaining training features after \'NaN\' filtering: ', length(possible.features), '\n')) + cat(paste0('Remaining training features after \'NA\' filtering: ', length(possible.features), '\n')) ## Predictive modeling @@ -312,7 +312,7 @@ train <- function(args){ axis(1, tcl=0.5) axis(2, tcl=0.5, las = 2) - legend('topright', legend = c('RLS fit', 'f(x)=x', paste0('(wρ = ', signif(r, 4), ')')), + legend('topright', legend = c('RLS fit', 'f(x)=x', paste0('(wr = ', signif(r, 4), ')')), bty = 'n', lty = c(2, 3, -1), col = c(color.A, color.B, 'black'), cex = 0.7, text.col = c(color.A, color.B, 'black'), text.font = c(2, 2, 1)) @@ -372,8 +372,8 @@ train <- function(args){ cat('PREFACE - PREdict FetAl ComponEnt\n\n') if (length(outliers) != 0){ cat(paste0('Below, some of the top candidates for outlier removal are listed.\n', - 'If you know some of these are low quality/have sex aberrations (when using FFY), remove them from the config file and re-run.\n', - 'Avoid removing other cases, as this will result in inaccurate performance statistics and possible overfitting towards unrelevant models.\n\n')) + 'If you know some of these are low quality/have sex aberrations (when using FFY as response variable), remove them from the config file and re-run.\n', + 'Avoid removing other cases, as this will result in inaccurate performance statistics and possible overfitting towards irrelevant models.\n\n')) cat('_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n') cat('ID\tFF (%) - PREFACE (%)\n') for (i in rev(order(outlier.values))){ @@ -383,7 +383,7 @@ train <- function(args){ } elapsed.time <- proc.time() - start.time cat(paste0('Training time: ', as.character(round(elapsed.time[3])), ' seconds\n')) - cat(paste0('Overall correlation (ρ): ', info[5], '\n')) + cat(paste0('Overall correlation (r): ', info[5], '\n')) cat(paste0('Overall mean absolute error (MAE): ', info[3], ' ± ', info[4], '\n')) cat(paste0('FF < 10% mean absolute error (MAE): ', mean(deviations.10), ' ± ', sd(deviations.10), '\n')) cat(paste0('Correction for skew: \n')) diff --git a/README.md b/README.md index 666b4bc..a825fa9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Background -Cell-free fetal DNA fraction is an important estimate during routine NIPT. Its arguably most important role is informing geneticists whether a test is conclusive: if the fraction of fetal-derived fragments is insufficient (this limit has often been debated to be 4%) statements on fetal aneuploidies cannot be made accurately. Furthermore, the fetal fraction carries valuable inherent information on whether observed potential aberrations are fetal-derived. Several techniques exist to deduce this figure, but the far most require additional experimental procedures, which impede routine execution in times of rising NIPT demand. Therefore, we set out to develop PREFACE, a software to accurately predict fetal fraction based on solely shallow-depth whole-genome sequencing data, which is the fundamental base of a default NIPT assay. In contrast to previous efforts, PREFACE enables user-friendly model training with a limited amount of retrospective data, sidelining between-laboratory bias. For sets of roughly 1100 male NIPT samples, a cross-validated correlation of 0.9 between predictions and fetal fractions according to Y chromosomal read counts was noted (FFY). The approach enables training with both male and unlabeled female feti. Using our complete cohort (nfemale=2468, nmale=2723), the correlation metric reached 0.94. To date, no shallow-depth whole-genome sequencing based software has been reported to perform better. In addition, PREFACE provides the fetal fraction based on the copy number state of chromosome X (FFX). When combined, PREFACE values, FFX and FFY predict mixed multiple pregnancies and sex chromosomal aneuploidies. Details can be found in the corresponding [paper](https://www.ncbi.nlm.nih.gov/pubmed/31219182). +A share of all cell-free DNA fragments isolated from maternal plasma during pregnancy is fetal-derived. This amount is referred to as the 'fetal fraction' and represents an important estimate during routine noninvasive prenatal testing (NIPT). Its most essential role is informing geneticists whether an assay is conclusive: if the fetal fraction is insufficient (this limit has often been debated to be 4%) claims on fetal aneuploidies cannot be made accurately. Several techniques exist to deduce this figure, but the far most require additional experimental procedures, which impede routine execution. Therefore, we set out to develop PREFACE, a software to accurately predict fetal fraction based on solely shallow-depth whole-genome sequencing data, which is the fundamental base of a default NIPT assay. In contrast to previous efforts, PREFACE enables user-friendly model training with a limited amount of retrospective data, which eliminates between-laboratory bias. For sets of roughly 1100 male NIPT samples, a cross-validated correlation of 0.9 between predictions and fetal fractions according to Y chromosomal read counts was noted (FFY). Our approach enables training with both male and unlabeled female fetuses: using our complete cohort (nfemale=2468, nmale=2723), the correlation metric reached 0.94. In addition, PREFACE provides the fetal fraction based on the copy number state of chromosome X (FFX). The presented statistics indirectly predict mixed multiple pregnancies, the source of observed events and sex chromosomal aneuploidies. All details can be found in our [corresponding paper](https://www.ncbi.nlm.nih.gov/pubmed/31219182). # Manual @@ -8,28 +8,27 @@ Cell-free fetal DNA fraction is an important estimate during routine NIPT. Its a ### Copy number alteration .bed files -Each sample (whether it is used for training or for predicting) should be passed to PREFACE in the following format. During benchmarking, copy number normalization was executed by [WisecondorX](https://github.com/CenterForMedicalGeneticsGhent/WisecondorX/), yet PREFACE is not limited to any copy number alteration software. Note that the default output of WisecondorX is directly interpretable by PREFACE. +Each sample (whether it is used for training or for predicting) should be passed to PREFACE in the format shown below. During benchmarking, using a bin size of 100 kb (others might work equally well), copy number normalization was performed by [WisecondorX](https://github.com/CenterForMedicalGeneticsGhent/WisecondorX/), yet PREFACE is not limited to any copy number alteration software, however, the default output of WisecondorX is directly interpretable by PREFACE. - Example: ```./examples/infile.bed``` - Tab-separated file with at least four columns. - The name of these columns (passed as a header) must be 'chr', 'start', 'end' and 'ratio'. - - The possible values for 'chr' are 1 until 22 and X and Y (uppercase). - - 'ratio' corresponds to the log2-transformed ratio between the observed and expected copy number. - - Loci can be indeterminable (e.g. at repeats). Here, 'ratio' values should be expressed as 'NaN'. -- The order of rows does not matter. Yet, it is of paramount importance that, for a certain line, file x deals with the same locus as file y. This implies, of course, that all copy number alteration files have the same number of lines. -- PREFACE was validated using a bin size of 100 kb, lower bin sizes are expected to perform at least equally well. + - The possible values of 'chr' are 1 until 22, and X and Y (uppercase). + - The 'ratio' column contains the log2-transformed ratio between the observed and expected copy number. + - The ratio can be unknown at certain loci (e.g. often seen at centromeres). Here, values should be expressed as 'NaN' or 'NA'. +- The order of rows does not matter. Yet, it is paramount that, for a certain line, file x deals with the same locus as file y. This implies, of course, that all copy number alteration files have the same number of lines. ### PREFACE's config.txt -For training, PREFACE requires a config file which contains every training case. +For training, PREFACE requires a config file. - Example: ```./examples/config.txt``` - Tab-separated file with at least four columns. - The name of these columns (passed as a header) must be 'ID', 'filepath', 'gender' and 'FF'. - - 'ID': a unique identifier should be given to each of the samples. - - 'filepath': the full absolute path of the copy number alteration files. - - 'gender': either 'M' (male) or 'F' (female), representing fetal gender. Twins/triplets/... can be included if they are all male or female. - - 'FF': the fetal fraction. One can use any formula/method he/she believes performs best at quantifying the actual fetal fraction (PREFACE was benchmarked according to the number of mapped Y-reads, referred to as FFY). This measurement will be ignored for female feti during the supervised learning phase, unless the `--femprop` flag is given (see below). + - 'ID' is used to specify a mandatory unique identifier to each of the samples. + - The 'filepath' column holds the full absolute path of the training copy number alteration files. + - The possible values for 'gender' are either 'M' (male) or 'F' (female), representing fetal gender. Twins/triplets/... can be included if they are all male or all female. + - The 'FF' column contains the response variable (the 'true' fetal fraction). One can use any method he/she believes performs best at quantifying the actual fetal fraction. PREFACE was benchmarked using the number of mapped Y-reads, referred to as FFY. As FFY is not informative for female fetuses, this measure is ignored for cases labeled with 'F', unless the `--femprop` flag is given (see below). ## Model training @@ -43,9 +42,9 @@ RScript PREFACE.R train --config path/to/config.txt --outdir path/to/dir/ [optio `--nfeat x` | Number of principal components to use during modeling. (default: x=50) `--hidden x` | Number of hidden layers used in neural network. Use with caution. (default: x=2) `--cpus x` | Use for multiprocessing, number of requested threads. (default: x=1) -`--femprop` | When using FFY as FF (recommended), FF labels for female feti are unrelevant, and should be ignored in the supervised learning phase (default). If this is not desired, use this flag, which claims that the given FFs for female feti are proportional to their actual FF. +`--femprop` | When using FFY as FF (recommended), FF labels for female fetuses are irrelevant, and should be ignored in the supervised learning phase (default). If this behavior is not desired, use this flag, which demands that the given FFs for female fetuses are proportional to their actual FF. `--olm` | It might be possible the neural network does not converge; or for your kind of data/sample size, an ordinary linear model might be a better option. In these cases, use this flag. -`--noskewcorrect` | This flag ascertains the best fit for most (instead of all) of the data is generated. +`--noskewcorrect` | This flag ascertains the best fit for most (instead of all) of the data is generated. Mostly not recommended. ## Predicting @@ -56,17 +55,17 @@ RScript PREFACE.R predict --infile path/to/infile.bed --model path/to/model.RDat
Optional argument

| Function :--- | :--- -`--json x` | Predictions are written to the stdout. Use this flag for json format. Optionally provide 'x' to generate .json file x. +`--json x` | Predictions are written to stdout. Use this flag for json format. Optionally provide 'x' to generate .json file x. -## Remarks +## Model optimization -- The most important parameter is `--nfeat` - - It represents the number of principal components that will be used as features during model training. Depending on the used copy number alteration software, bin size and number of training samples, it might have different optimal values. In general, I would recommend to train a model using the default parameters. The output will contain a plot that enables you to review the selected `--nfeat`. Two parts should be seen in the proportion of variance across the principal components (indexed in order of importance): - - A 'random' phase (representing PCs that explain variance caused by, inter alia, fetal fraction). - - A 'non-random' phase (representing PCs that explain variance caused by naturally occurring Gaussian noise). - - An optimal `--nfeat` captures the 'random' phase (as shown in the example at `./examples/overall_performance.png`). Capturing too much of the 'non-random' phase could lead to convergence problems during modeling. - - If you are not satisfied with the concordance of your model or with the position of `--nfeat`, re-run with a different number of features. -- Note that the resulting model (the one that is written to the output) will probably be a bit more accurate than what is claimed by the statistics file and figures. This is because PREFACE uses a cross-validation strategy where 10% of the (male) samples are excluded from training, after which these 10% serve as validation cases. This process is repeated 10 times. Therefore, the final performance measurements are based on models trained with only 90% of the (male) feti, yet the resulting model is trained with all provided cases. +- The most important parameter is `--nfeat`: + - It represents the number of principal components (PCs) that will be used as features during model training. Depending on the used copy number alteration software, bin size and the number of training samples, it might have different optimal values. In general, I recommend to train a model using the default parameters. The output will contain a plot that enables you to review the selected `--nfeat`. Two parts should be seen in the proportion of variance across the principal components (indexed in order of importance): + - A 'random' phase (representing PCs that explain variance caused by, inter alia, fetal fraction). + - A 'non-random' phase (representing PCs that explain variance caused by natural Gaussian noise). + - An optimal `--nfeat` captures the 'random' phase (as shown in the example at `./examples/overall_performance.png`). Capturing too much of the 'non-random' phase could lead to convergence problems during modeling. + - If you are not satisfied with the performance of your model or with the position of `--nfeat`, re-run with a different number of features. +- Note that the final model will probably be a bit more accurate than what is claimed by the performance statistics. This is because PREFACE uses a cross-validation strategy where 10% of the (male) samples are excluded from training, after which these 10% serve as validation cases. This process is repeated 10 times. Therefore, the final performance measurements are based on models trained with only 90% of the (male) fetuses, yet the resulting model is trained with all provided cases. # Required R packages @@ -77,8 +76,7 @@ RScript PREFACE.R predict --infile path/to/infile.bed --model path/to/model.RDat - data.table (v1.11.8) - MASS (v7.3-49) - -Other versions are of course expected to work equally well. To install within R use +Other versions are of course expected to work equally well. To install within R use: ```bash